In [1]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 5.2 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 46.7 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 42.3 MB/s 
[?25hCollecting tokenizers!=0.11.3,>=0.11.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 41.6 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 4.9 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found ex

In [2]:
import torch

## 2. Load Fine-Tuned BERT-large

In [3]:
from transformers import BertForQuestionAnswering

model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')


Downloading:   0%|          | 0.00/443 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.25G [00:00<?, ?B/s]

Load the tokenizer as well. 

Side note: Apparently the vocabulary of this model is identicaly to the one in bert-base-uncased. You can load the tokenizer from `bert-base-uncased` and that works just as well.

In [4]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

In [6]:
question = "How many parameters does BERT-large have?"
single_context = "BERT-large is really big... it has 24-layers and an embedding size of 1,024, for a total of 340M parameters! Altogether it is 1.34GB, so expect it to take a couple minutes to download to your Colab instance. nosense nosensenosensenosensenosens aksd;lf 'as;lddfk asd'f;lkjasdfpuihasdf546a5s4dfasdfjlaksdfj823lkajsdfp8q8uwu34  asdf9uasldkjhjq33pr98u lajsjddfp988u234lkjalsdkjdfhas24q"

In [9]:
multiple_context = [single_context[0:200],single_context[100:300],single_context[200:400],single_context[300:]]

In [10]:
multiple_context

['BERT-large is really big... it has 24-layers and an embedding size of 1,024, for a total of 340M parameters! Altogether it is 1.34GB, so expect it to take a couple minutes to download to your Colab in',
 "ameters! Altogether it is 1.34GB, so expect it to take a couple minutes to download to your Colab instance. nosense nosensenosensenosensenosens aksd;lf 'as;lddfk asd'f;lkjasdfpuihasdf546a5s4dfasdfjlak",
 "stance. nosense nosensenosensenosensenosens aksd;lf 'as;lddfk asd'f;lkjasdfpuihasdf546a5s4dfasdfjlaksdfj823lkajsdfp8q8uwu34  asdf9uasldkjhjq33pr98u lajsjddfp988u234lkjalsdkjdfhas24q",
 'sdfj823lkajsdfp8q8uwu34  asdf9uasldkjhjq33pr98u lajsjddfp988u234lkjalsdkjdfhas24q']

In [35]:
def answer_question(question, answer_text):
    '''
    Takes a `question` string and an `answer_text` string (which contains the
    answer), and identifies the words within the `answer_text` that are the
    answer. Prints them out.
    '''
    # ======== Tokenize ========
    # Apply the tokenizer to the input text, treating them as a text-pair.
    input_ids = tokenizer.encode(question, answer_text)

    # Report how long the input sequence is.
    #print('Query has {:,} tokens.\n'.format(len(input_ids)))

    # ======== Set Segment IDs ========
    # Search the input_ids for the first instance of the `[SEP]` token.
    sep_index = input_ids.index(tokenizer.sep_token_id)

    # The number of segment A tokens includes the [SEP] token istelf.
    num_seg_a = sep_index + 1

    # The remainder are segment B.
    num_seg_b = len(input_ids) - num_seg_a

    # Construct the list of 0s and 1s.
    segment_ids = [0]*num_seg_a + [1]*num_seg_b

    # There should be a segment_id for every input token.
    assert len(segment_ids) == len(input_ids)

    # ======== Evaluate ========
    # Run our example through the model.
    outputs = model(torch.tensor([input_ids]), # The tokens representing our input text.
                    token_type_ids=torch.tensor([segment_ids]), # The segment IDs to differentiate question from answer_text
                    return_dict=True) 

    start_scores = outputs.start_logits
    end_scores = outputs.end_logits

    # ======== Reconstruct Answer ========
    # Find the tokens with the highest `start` and `end` scores.
    answer_start = torch.argmax(start_scores)
    answer_end = torch.argmax(end_scores)
    best_start_score = torch.max(start_scores)
    best_end_score = torch.max(end_scores)

    # Get the string versions of the input tokens.
    tokens = tokenizer.convert_ids_to_tokens(input_ids)

    # Start with the first token.
    answer = tokens[answer_start]

    # Select the remaining answer tokens and join them with whitespace.
    for i in range(answer_start + 1, answer_end + 1):
        
        # If it's a subword token, then recombine it with the previous token.
        if tokens[i][0:2] == '##':
            answer += tokens[i][2:]
        
        # Otherwise, add a space then the token.
        else:
            answer += ' ' + tokens[i]

    #print('Answer: "' + answer + '"')
    return answer, best_start_score, best_end_score

In [24]:
answer, best_start_score, best_end_score = answer_question(question, single_context)

In [28]:
print(answer + "  " + str(best_start_score) + "  " + str(best_end_score))

340m  tensor(5.7587, grad_fn=<MaxBackward1>)  tensor(4.6056, grad_fn=<MaxBackward1>)


In [32]:
best_start_score > best_end_score

tensor(True)

In [41]:
BestTotalScore = 0
FinalAnswer = "nothing"
for context_part in multiple_context:
  print("--------------------")
  print(context_part)
  answer, best_start_score, best_end_score = answer_question(question, context_part)
  print(best_start_score)
  print(best_end_score)
  print(answer)
  NewScore = best_start_score + best_end_score
  print(NewScore)
  if(NewScore > BestTotalScore):
    FinalAnswer = answer
    BestTotalScore = NewScore
  else:
    print("No answer update")

--------------------
BERT-large is really big... it has 24-layers and an embedding size of 1,024, for a total of 340M parameters! Altogether it is 1.34GB, so expect it to take a couple minutes to download to your Colab in
Answer: "340m"
tensor(6.2076, grad_fn=<MaxBackward1>)
tensor(5.2094, grad_fn=<MaxBackward1>)
340m
tensor(11.4170, grad_fn=<AddBackward0>)
--------------------
ameters! Altogether it is 1.34GB, so expect it to take a couple minutes to download to your Colab instance. nosense nosensenosensenosensenosens aksd;lf 'as;lddfk asd'f;lkjasdfpuihasdf546a5s4dfasdfjlak
Answer: "1 . 34gb"
tensor(4.1590, grad_fn=<MaxBackward1>)
tensor(4.2650, grad_fn=<MaxBackward1>)
1 . 34gb
tensor(8.4240, grad_fn=<AddBackward0>)
No answer update
--------------------
stance. nosense nosensenosensenosensenosens aksd;lf 'as;lddfk asd'f;lkjasdfpuihasdf546a5s4dfasdfjlaksdfj823lkajsdfp8q8uwu34  asdf9uasldkjhjq33pr98u lajsjddfp988u234lkjalsdkjdfhas24q
Answer: "##24"
tensor(-2.9170, grad_fn=<MaxBackward1>

In [42]:
FinalAnswer

'340m'