In [1]:
! pip install transformers

%tensorflow_version 2.x
import tensorflow as tf
print(tf.__version__)

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m23.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m51.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m39.0 MB/s[0m eta [36m0:00:0

## Fine-tuning BERT
- BERT pre-training phase: Masked Language Model (MLM) and Next Sentence Prediction (NSP)

- Then, fine-tune all of the weights of the pre-trained model on a task of choice.

- Source: https://arxiv.org/pdf/1810.04805.pdf

<img src="https://i.imgur.com/fKNAhKH.png" alt=" " width="75%" height="75%">



##  QuestionAnswering Model (inbuilt)
- Pre-training (NSP, MLM) followed by fine-tuning on the SQuAD dataset
- Extractive model: the answer is a span of tokens selected from the input text
- SQuAD: https://rajpurkar.github.io/SQuAD-explorer/

- "SQuAD2.0 combines the 100,000 questions in SQuAD1.1 with over 50,000 unanswerable questions written adversarially by crowdworkers to look similar to answerable ones. To do well on SQuAD2.0, systems must not only answer questions when possible, but also determine when no answer is supported by the paragraph and abstain from answering."



In [2]:
# Using HuggingFace's QuestionAnsweringModel (inbuilt)
# This cell loads the tokenizer and the TensorFlow BERT question-answering model
# for one pretrained checkpoint; later cells rely on `tokenizer` and `model`.

import tensorflow as tf

# Refer: https://huggingface.co/transformers/model_doc/bert.html

from transformers import BertTokenizer, TFBertForQuestionAnswering

# Checkpoint identifier on the HuggingFace hub; per its name it is already
# fine-tuned on SQuAD, so no training happens in this notebook.
modelName = 'bert-large-uncased-whole-word-masking-finetuned-squad' # https://huggingface.co/transformers/pretrained_models.html

# The first call downloads the vocab/config/weights (the weights file is about
# 1.34 GB according to the recorded output below).
tokenizer = BertTokenizer.from_pretrained(modelName)
model = TFBertForQuestionAnswering.from_pretrained(modelName)

# Prints only the object repr (class name and address), not a layer summary.
print(model)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForQuestionAnswering.

All the weights of TFBertForQuestionAnswering were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForQuestionAnswering for predictions without further training.


<transformers.models.bert.modeling_tf_bert.TFBertForQuestionAnswering object at 0x7cb4e5efb7f0>


In [3]:
# Passage and question taken from the SQuAD 2.0 dev set:
# https://rajpurkar.github.io/SQuAD-explorer/explore/v2.0/dev/Amazon_rainforest.html?model=nlnet%20(single%20model)%20(Microsoft%20Research%20Asia)&version=v2.0
text = r"""The Amazon rainforest (Portuguese: Floresta Amazônica or Amazônia; Spanish: Selva Amazónica, Amazonía or usually Amazonia; French: Forêt amazonienne; Dutch: Amazoneregenwoud), also known in English as Amazonia or the Amazon Jungle, is a moist broadleaf forest that covers most of the Amazon basin of South America. This basin encompasses 7,000,000 square kilometres (2,700,000 sq mi), of which 5,500,000 square kilometres (2,100,000 sq mi) are covered by the rainforest. This region includes territory belonging to nine nations. The majority of the forest is contained within Brazil, with 60% of the rainforest, followed by Peru with 13%, Colombia with 10%, and with minor amounts in Venezuela, Ecuador, Bolivia, Guyana, Suriname and French Guiana. States or departments in four nations contain "Amazonas" in their names. The Amazon represents over half of the planet's remaining rainforests, and comprises the largest and most biodiverse tract of tropical rainforest in the world, with an estimated 390 billion individual trees divided into 16,000 species."""
question = r"""What percentage does the Amazon represents in rainforests on the planet?"""

# Alternative question to try:
#question = r"""How many nations contain "Amazonas" in their names?"""

# Only the separator between question and passage is written by hand; the
# decoded output below shows that tokenizer.encode adds the leading [CLS] itself.
input_text = f"{question} [SEP] {text}"
input_ids = tokenizer.encode(input_text)

# Sanity check: number of tokens and the round-tripped (lower-cased) text.
print(len(input_ids))
print(tokenizer.decode(input_ids))

# Add a leading batch axis so the model receives shape (1, sequence_length).
input = tf.expand_dims(tf.constant(input_ids), axis=0)  # Batch size 1


250
[CLS] what percentage does the amazon represents in rainforests on the planet? [SEP] the amazon rainforest ( portuguese : floresta amazonica or amazonia ; spanish : selva amazonica, amazonia or usually amazonia ; french : foret amazonienne ; dutch : amazoneregenwoud ), also known in english as amazonia or the amazon jungle, is a moist broadleaf forest that covers most of the amazon basin of south america. this basin encompasses 7, 000, 000 square kilometres ( 2, 700, 000 sq mi ), of which 5, 500, 000 square kilometres ( 2, 100, 000 sq mi ) are covered by the rainforest. this region includes territory belonging to nine nations. the majority of the forest is contained within brazil, with 60 % of the rainforest, followed by peru with 13 %, colombia with 10 %, and with minor amounts in venezuela, ecuador, bolivia, guyana, suriname and french guiana. states or departments in four nations contain " amazonas " in their names. the amazon represents over half of the planet's remaining rainf

In [4]:
# Source: Modified PyTorch code from https://www.kaggle.com/c/tensorflow2-question-answering/discussion/123434

# token_type_ids (segment ids) tell BERT which tokens belong to which segment:
#   0 -> the question part of input_text, from [CLS] up to and including the first [SEP]
#   1 -> the text/passage part that follows
# The [SEP] id is taken from the tokenizer (102 for this vocabulary) instead of
# being hard-coded, and its position is looked up once rather than per token.
num_question_tokens = input_ids.index(tokenizer.sep_token_id) + 1
token_type_ids = [0] * num_question_tokens + [1] * (len(input_ids) - num_question_tokens)

print(token_type_ids)


[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [13]:
# Forward pass: token ids plus segment ids, both with a leading batch axis of 1.
answer = model(input, token_type_ids=tf.convert_to_tensor([token_type_ids]))

print(type(answer))
print(len(answer))

# Read the logits by name rather than by position: positional indexing into a
# model output shifts as soon as other fields (e.g. a loss) are populated.
# Each tensor holds one score per input token for being the start / end of the answer.
startScores = answer.start_logits
endScores = answer.end_logits

print(startScores)
print(startScores.shape)
print(endScores.shape)


<class 'transformers.modeling_tf_outputs.TFQuestionAnsweringModelOutput'>
2
tf.Tensor(
[[-6.047484   -4.6560254  -7.5719843  -7.8129907  -7.0066047  -7.894754
  -8.126448   -7.5400662  -8.892705   -9.418996   -8.942823   -8.615168
  -9.496941   -9.665026   -6.047459   -6.15179    -3.5666304  -7.0746512
  -8.149384   -6.5617323  -8.559913   -7.116378   -8.553457   -7.5595984
  -8.405012   -8.682978   -6.790609   -8.405869   -8.498199   -6.2351665
  -8.77474    -7.628984   -8.575293   -7.887277   -8.571488   -8.881729
  -7.3598747  -8.744015   -8.553473   -6.743695   -7.2684555  -8.534404
  -8.565098   -6.48624    -8.576065   -6.986377   -8.542221   -7.4793887
  -8.858679   -8.480019   -8.771893   -6.9176674  -8.668295   -7.195728
  -8.835161   -8.785322   -8.765894   -8.421244   -7.5677586  -7.9901147
  -7.5482955  -7.821805   -8.122395   -7.256473   -8.404978   -6.545388
  -8.211746   -8.552202   -7.2628655  -6.0843143  -7.75927    -8.038822
  -7.137994   -7.301934   -7.178516   -7.642

In [14]:
# Map every id back to its vocabulary token so that positions in the score
# tensors can be read as tokens (the recorded output shows '##' continuation pieces).
input_tokens = tokenizer.convert_ids_to_tokens(input_ids)

print(input_tokens)

['[CLS]', 'what', 'percentage', 'does', 'the', 'amazon', 'represents', 'in', 'rainforest', '##s', 'on', 'the', 'planet', '?', '[SEP]', 'the', 'amazon', 'rainforest', '(', 'portuguese', ':', 'flores', '##ta', 'amazon', '##ica', 'or', 'amazon', '##ia', ';', 'spanish', ':', 'se', '##lva', 'amazon', '##ica', ',', 'amazon', '##ia', 'or', 'usually', 'amazon', '##ia', ';', 'french', ':', 'fore', '##t', 'amazon', '##ien', '##ne', ';', 'dutch', ':', 'amazon', '##ere', '##gen', '##wo', '##ud', ')', ',', 'also', 'known', 'in', 'english', 'as', 'amazon', '##ia', 'or', 'the', 'amazon', 'jungle', ',', 'is', 'a', 'moist', 'broad', '##leaf', 'forest', 'that', 'covers', 'most', 'of', 'the', 'amazon', 'basin', 'of', 'south', 'america', '.', 'this', 'basin', 'encompasses', '7', ',', '000', ',', '000', 'square', 'kilometres', '(', '2', ',', '700', ',', '000', 'sq', 'mi', ')', ',', 'of', 'which', '5', ',', '500', ',', '000', 'square', 'kilometres', '(', '2', ',', '100', ',', '000', 'sq', 'mi', ')', 'are', 

In [15]:
# Show the start scores again, next to the token list printed by the previous cell.
print(startScores)


tf.Tensor(
[[-6.047484   -4.6560254  -7.5719843  -7.8129907  -7.0066047  -7.894754
  -8.126448   -7.5400662  -8.892705   -9.418996   -8.942823   -8.615168
  -9.496941   -9.665026   -6.047459   -6.15179    -3.5666304  -7.0746512
  -8.149384   -6.5617323  -8.559913   -7.116378   -8.553457   -7.5595984
  -8.405012   -8.682978   -6.790609   -8.405869   -8.498199   -6.2351665
  -8.77474    -7.628984   -8.575293   -7.887277   -8.571488   -8.881729
  -7.3598747  -8.744015   -8.553473   -6.743695   -7.2684555  -8.534404
  -8.565098   -6.48624    -8.576065   -6.986377   -8.542221   -7.4793887
  -8.858679   -8.480019   -8.771893   -6.9176674  -8.668295   -7.195728
  -8.835161   -8.785322   -8.765894   -8.421244   -7.5677586  -7.9901147
  -7.5482955  -7.821805   -8.122395   -7.256473   -8.404978   -6.545388
  -8.211746   -8.552202   -7.2628655  -6.0843143  -7.75927    -8.038822
  -7.137994   -7.301934   -7.178516   -7.6427236  -8.35898    -7.066742
  -8.150646   -7.197265   -6.813742   -8.397958 

In [16]:
# Position of the highest start score, still wrapped in a scalar tf.Tensor.
# Docs: https://www.tensorflow.org/api_docs/python/tf/math/argmax
print(tf.argmax(startScores[0], axis=0))


tf.Tensor(207, shape=(), dtype=int64)


In [17]:
# Same position, unwrapped from the tensor with .numpy().
print(tf.argmax(startScores[0], axis=0).numpy())

207


In [18]:
# Most likely start token of the answer.
startIdx = int(tf.math.argmax(startScores[0], 0).numpy())

# Most likely end token, searched only at or after the start. Taking the two
# argmaxes independently can place the end before the start, which would
# silently produce an empty answer. The +1 makes endIdx an exclusive bound
# suitable for slicing.
endIdx = startIdx + int(tf.math.argmax(endScores[0][startIdx:], 0).numpy()) + 1
print(startIdx, endIdx)

207 209


In [19]:
# Let the tokenizer rebuild the text so that WordPiece continuations ('##...')
# are merged back into whole words instead of being printed as separate pieces.
print(tokenizer.convert_tokens_to_string(input_tokens[startIdx:endIdx]))

over half


## Additional Resources
- Mobile BERT QA System (code in pure TF/Keras):  https://www.tensorflow.org/lite/models/bert_qa/overview

- https://towardsdatascience.com/testing-bert-based-question-answering-on-coronavirus-articles-13623637a4ff

- Finetune on custom-data: https://huggingface.co/transformers/examples.html#squad