### Importing the Model & Tokenizer

In [2]:
from transformers import BertForQuestionAnswering
from transformers import BertTokenizer
import torch

In [3]:
# Hugging Face checkpoint identifier; the same name is used to load both the
# model and its matching tokenizer in the cells below.
model_name = "bert-large-uncased-whole-word-masking-finetuned-squad"

In [4]:
# Load the checkpoint into a BERT model with a question-answering head.
# The "weights ... were not used" warning printed below concerns only the pooler
# layer, which BertForQuestionAnswering does not have.
model = BertForQuestionAnswering.from_pretrained(model_name)

config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
# Load the tokenizer from the same checkpoint so token ids match the model's vocabulary.
tokenizer = BertTokenizer.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

### Embeddings

In [6]:
# Example question and the passage the answer will be extracted from.
qes = "When was the first dvd released?"
ans_doc = "The first DVD (Digital Versatile Disc) was released on March 24, 1997. It was a movie titled 'Twister' and was released in Japan."

In [7]:
# Encode the (question, passage) pair as one sequence. The result contains
# 'input_ids', 'token_type_ids' and 'attention_mask' (see the printed dict).
encoding = tokenizer.encode_plus(text=qes, text_pair=ans_doc)
print(encoding)

{'input_ids': [101, 2043, 2001, 1996, 2034, 4966, 2207, 1029, 102, 1996, 2034, 4966, 1006, 3617, 22979, 5860, 1007, 2001, 2207, 2006, 2233, 2484, 1010, 2722, 1012, 2009, 2001, 1037, 3185, 4159, 1005, 9792, 2121, 1005, 1998, 2001, 2207, 1999, 2900, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [8]:
# Vocabulary ids for the whole [CLS] question [SEP] passage [SEP] sequence.
inputs = encoding['input_ids']
# Despite the name, this holds the segment ids (token_type_ids): 0 for the
# question part, 1 for the passage part -- not an embedding vector.
sentence_embedding = encoding['token_type_ids']
# Human-readable WordPiece tokens, aligned index-for-index with `inputs`.
tokens = tokenizer.convert_ids_to_tokens(inputs)

In [9]:
# Show which special tokens ids 101 and 102 (seen in input_ids above) stand for.
print(tokenizer.decode(101))
print(tokenizer.decode(102))

[CLS]
[SEP]


In [10]:
# Run the model on a batch of one: wrapping each id list in another list adds
# the batch dimension.
# NOTE(review): this runs with autograd enabled (the output below carries a
# grad_fn); consider `with torch.no_grad():` for inference-only use.
output = model(input_ids = torch.tensor([inputs]), token_type_ids = torch.tensor([sentence_embedding]))

In [11]:
# Inspect the raw result: one start logit and one end logit per input token.
output

QuestionAnsweringModelOutput(loss=None, start_logits=tensor([[-5.9933, -5.6337, -7.7490, -6.9855, -7.0095, -7.0431, -6.8803, -8.8654,
         -5.9933,  2.5914,  0.7486, -1.2023, -5.3193, -3.0590, -5.7897, -6.0968,
         -3.9138,  0.0109,  0.3063,  2.7610,  8.3968,  1.2349, -2.6101,  3.3470,
         -3.7737, -2.7310, -6.1388, -4.9750, -3.5969, -5.7935, -5.1681, -4.2574,
         -6.7649, -6.6634, -7.0774, -5.2035, -3.6942, -4.3716, -1.6285, -5.9940,
         -5.9933]], grad_fn=<CloneBackward0>), end_logits=tensor([[-0.5517, -6.0053, -6.4805, -7.7460, -6.9394, -5.7487, -7.1025, -6.0848,
         -0.5515, -4.4967, -3.2276, -1.3927, -6.6288, -5.2535, -4.8436, -4.4169,
         -3.3680, -4.0887, -2.8982, -2.4303, -0.2052,  1.4324,  0.3900,  9.4551,
          5.1440, -3.9077, -5.8027, -6.0257, -2.7210, -4.9631, -6.2113, -5.1053,
         -3.1412, -2.8056, -5.7051, -6.0462, -5.5627, -6.2826, -0.6443, -0.5528,
         -0.5526]], grad_fn=<CloneBackward0>), hidden_states=None, attentions=N

### Output

In [12]:
# Pick the most likely start and end token positions. argmax without `dim`
# indexes the flattened tensor, which equals the token position here because
# the batch holds a single sequence.
start_index = torch.argmax(output.start_logits)
end_index = torch.argmax(output.end_logits)

In [15]:
# Both values are 0-dim tensors holding positions into `tokens`.
print(start_index)
print(end_index)

tensor(20)
tensor(23)


In [16]:
# Slice the answer span (end_index is inclusive, hence the +1) and join the
# tokens with spaces. Tokens are not merged back into natural text here, which
# is why punctuation appears space-separated in the output.
answer = ' '.join(tokens[start_index:end_index+1])
print(answer)

march 24 , 1997


### FAQ Chatbot

In [18]:
# Knowledge passage that faq_bot searches for answers; the text is fed to the
# model verbatim.
context_sen = "This is the context about a company that deals with computer parts. It is founded in 1990s by a programmer named SA30. It now have 3 branches and multinational brand value. It also works with selling softwares and hardware related to usb."

In [24]:
def faq_bot(que, context=None):
    """Answer a question by extracting a span from a context passage.

    Parameters
    ----------
    que : str
        The question to answer.
    context : str, optional
        Passage to search for the answer. Defaults to the notebook-level
        ``context_sen`` when omitted.

    Returns
    -------
    str
        The extracted answer with WordPiece fragments ("##...") merged back
        into whole words. Every word is prefixed with one space, so a
        non-empty answer starts with a space. If the model does not produce a
        usable span, a fallback message is returned instead.
    """
    if context is None:
        context = context_sen

    input_ids = tokenizer.encode(que, context)
    tokens = tokenizer.convert_ids_to_tokens(input_ids)

    # Segment ids: 0 up to and including the first [SEP] (the question),
    # 1 for everything after it (the context).
    sep_idx = input_ids.index(tokenizer.sep_token_id)
    num_seg_a = sep_idx + 1
    num_seg_b = len(input_ids) - num_seg_a
    segment_ids = [0] * num_seg_a + [1] * num_seg_b

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        output = model(torch.tensor([input_ids]), token_type_ids=torch.tensor([segment_ids]))

    # Plain ints so they can be compared and used as slice bounds directly.
    ans_start = int(torch.argmax(output.start_logits))
    ans_end = int(torch.argmax(output.end_logits))

    # Reject inverted spans (previously this path printed a message and then
    # crashed on an unbound variable) and spans that begin inside the
    # question segment, which cannot be an answer taken from the context.
    if ans_end < ans_start or ans_start <= sep_idx:
        return "I am unable to ans this plz ask again"

    # Glue "##" continuation pieces onto the preceding word; start every other
    # token with a space.
    corrected_ans = ""
    for piece in tokens[ans_start:ans_end + 1]:
        if piece.startswith('##'):
            corrected_ans += piece[2:]
        else:
            corrected_ans += ' ' + piece

    return corrected_ans

In [25]:
# Smoke test: ask a question whose answer appears in the context passage.
faq_bot("what does this company does?")

' deals with computer parts'