In [79]:
%config Completer.use_jedi = False
 
import torch
# !pip install transformers
# !pip install transformers==3

In [80]:
import pandas as pd

In [105]:
from transformers import BertForQuestionAnswering
from transformers import BertTokenizer

#### Reference
##### https://mccormickml.com/2020/03/10/question-answering-with-a-fine-tuned-BERT/#bert-input-format

<!-- To get pretrained Models -->

### To get pretrained Models
https://huggingface.co/transformers/pretrained_models.html

#### tokenizer_base = BertTokenizer.from_pretrained('bert-base-uncased')
#### model_base = BertModel.from_pretrained("bert-base-uncased")

In [106]:
# Checkpoint shared by the tokenizer and the model. Keeping the name in one place
# guarantees both are loaded from the same pretrained weights and vocabulary.
MODEL_NAME = 'bert-large-uncased-whole-word-masking-finetuned-squad'

# Tokenizer: turns text into the token IDs this checkpoint expects.
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
# Model: BERT with a question-answering head (produces start/end logits).
model = BertForQuestionAnswering.from_pretrained(MODEL_NAME)

### Datasets for Question Answering
#### https://analyticsindiamag.com/10-question-answering-datasets-to-build-robust-chatbot-systems/
#### https://lionbridge.ai/datasets/15-best-chatbot-datasets-for-machine-learning/

In [107]:
# The question we want BERT to answer.
sample_question = "What am I losing when using extension tubes instead of a macro lens?"

# The reference text in which the answer span is searched for.
sample_paragraph = """After playing around with macro photography on-the-cheap (read: reversed lens, rev. lens mounted on a straight lens, passive extension tubes), I would like to get further with this. The problems with the techniques I used is that focus is manual and aperture control is problematic at best. This limited my setup to still subjects (read: dead insects) Now, as spring is approaching, I want to be able to shoot live insects. I believe that for this, autofocus and settable aperture will be of great help.

So, one obvious but expensive option is a macro lens (say, EF 100mm Macro) However, I am not really interested in yet another prime lens. An alternative is the electrical extension tubes.

Except for maximum focusing distance, what am I losing when using tubes (coupled with a fine lens, say EF70-200/2.8) instead of a macro lens?
"""

In [65]:
# Encode the question and the paragraph together as one sequence pair of token IDs.
tokens_id = tokenizer.encode(sample_question, text_pair=sample_paragraph)

# Report how many tokens the combined input contains.
print(len(tokens_id))

204


## Converting Tokens to IDs
When the BERT model was trained, each token was given a unique ID. Hence, when we want to use a pre-trained BERT model, we first need to convert each token in the input sentence into its corresponding unique ID.

There is an important point to note when we use a pre-trained model. Since the model is pre-trained on a certain corpus, the vocabulary was also fixed. In other words, when we apply a pre-trained model to some other data, it is possible that some tokens in the new data might not appear in the fixed vocabulary of the pre-trained model. This is commonly known as the out-of-vocabulary (OOV) problem.

Tokens that do not appear in the original vocabulary are replaced with the special token [UNK], which stands for "unknown token".

However, converting all unseen tokens into [UNK] will take away a lot of information from the input data. Hence, BERT makes use of the WordPiece algorithm, which breaks a word into several subwords, such that commonly seen subwords can also be represented by the model.

In [66]:
# Map every ID back to its token string so the two can be shown side by side.
tokens = tokenizer.convert_ids_to_tokens(tokens_id)

# Print one "token / ID" row per input position.
# The loop variable is named `token_id` (not `id`) so the builtin `id()` is not
# shadowed in the kernel namespace for every cell that runs afterwards.
for token, token_id in zip(tokens, tokens_id):
    is_sep = token_id == tokenizer.sep_token_id

    # Surround each [SEP] row with blank lines to make it stand out.
    if is_sep:
        print('')

    # Token left-aligned, ID right-aligned with a thousands separator.
    print('{:<12} {:>6,}'.format(token, token_id))

    if is_sep:
        print('')


[CLS]           101
what          2,054
am            2,572
i             1,045
losing        3,974
when          2,043
using         2,478
extension     5,331
tubes        10,868
instead       2,612
of            1,997
a             1,037
macro        26,632
lens         10,014
?             1,029

[SEP]           102

after         2,044
playing       2,652
around        2,105
with          2,007
macro        26,632
photography   5,855
on            2,006
-             1,011
the           1,996
-             1,011
cheap        10,036
(             1,006
read          3,191
:             1,024
reversed     11,674
lens         10,014
,             1,010
rev           7,065
.             1,012
lens         10,014
mounted       5,614
on            2,006
a             1,037
straight      3,442
lens         10,014
,             1,010
passive      13,135
extension     5,331
tubes        10,868
)             1,007
,             1,010
i             1,045
would         2,052
like          2,06

We've concatenated the question and answer_text together, but BERT still needs a way to distinguish them. BERT has two special "Segment" embeddings, one for segment "A" and one for segment "B". Before the word embeddings go into the BERT layers, the segment A embedding needs to be added to the question tokens, and the segment B embedding needs to be added to each of the answer_text tokens.

These additions are handled for us by the transformers library, and all we need to do is specify a '0' or '1' for each token.

Note: In the transformers library, huggingface likes to call these token_type_ids, but I'm going with segment_ids since this seems clearer, and is consistent with the BERT paper.



In [68]:
# Position of the first [SEP] token in the encoded input.
sep_index = tokens_id.index(tokenizer.sep_token_id)

# Segment A covers everything up to and including that [SEP] token itself;
# segment B is whatever remains.
num_seg_a = sep_index + 1
num_seg_b = len(tokens_id) - num_seg_a

# 0 marks a segment A position, 1 marks a segment B position.
segment_ids = [0 if position < num_seg_a else 1 for position in range(len(tokens_id))]

# Sanity check: exactly one segment ID per input token.
assert len(segment_ids) == len(tokens_id)

In [101]:
 output= model(torch.tensor([tokens_id]), # The tokens representing our input text.
                                 token_type_ids=torch.tensor([segment_ids])) # The segment IDs to differentiate question from answer_text
print(output)

QuestionAnsweringModelOutput(loss=None, start_logits=tensor([[-5.2245, -4.3151, -7.1407, -7.5972, -7.2100, -8.0673, -8.5111, -7.7792,
         -8.1569, -9.1126, -9.0503, -8.8063, -8.0988, -9.0621, -9.6507, -5.2245,
         -6.3640, -6.6925, -7.9785, -7.9345, -3.5484, -4.8954, -7.7344, -8.7217,
         -8.1553, -8.1698, -6.5690, -6.7664, -5.8297, -7.9816, -4.8527, -7.1700,
         -8.5508, -5.3767, -8.7314, -6.4992, -8.0076, -8.1405, -8.0837, -6.7332,
         -5.4148, -7.9432, -3.4901, -4.1135, -4.3120, -8.1806, -7.9794, -5.9123,
         -7.9929, -8.0603, -8.2150, -7.8103, -7.7710, -7.9825, -6.6707, -7.5217,
         -5.8491, -5.6869, -8.6547, -7.9563, -6.5744, -8.4726, -7.6087, -8.2721,
         -6.2865, -0.1566, -8.1714, -4.6211, -7.8546, -0.1980, -5.1878, -8.2429,
         -5.8505, -8.8104, -6.8585, -8.4170, -6.8890, -5.8871, -7.7531, -7.3217,
         -8.6676, -7.0777, -6.5268, -7.8989, -6.3930, -8.4051, -5.4212, -5.9651,
         -8.1715, -6.9076, -8.1394, -6.9000, -6.4994, -8

In [104]:
# Find the tokens with the highest `start` and `end` scores.
# tokens_id[]
# Find the tokens with the highest `start` and `end` scores.
answer_start = torch.argmax(output.start_logits)
answer_end = torch.argmax(output.end_logits)

# Combine the tokens of the predicted span into one string.
# WordPiece marks a continuation piece with a leading '##'; dropping the
# ' ##' separator glues such pieces back onto the preceding piece so the
# answer reads as whole words instead of e.g. "auto ##focus".
# If the predicted end precedes the start, the slice is empty and so is the answer.
answer = ' '.join(tokens[answer_start:answer_end + 1]).replace(' ##', '')

print('Answer: "' + answer + '"')

Answer: "maximum focusing distance"
