**Using a pretrained BERT model to build a question-answering model on top of the BERT architecture.**

The implementation has the following sequence of steps:

* Set up the BERT tokenizer
* Load the dataset and preprocess it
* Create a question-answering model using BERT, plus an evaluation step (implemented as a callback)
* Train and evaluate the model

In [None]:
! pip install tokenizers



In [None]:
! pip install transformers



In [None]:
# Import required libraries
import os
import re
import json
import string
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tokenizers import BertWordPieceTokenizer
from transformers import BertTokenizer, TFBertModel, BertConfig



In [None]:
# Define default parameters and configuration for BERT
# max_length is the fixed sequence length of the model inputs: shorter encoded
# examples are zero-padded up to it and longer ones are skipped in preprocessing.
max_length = 384
# NOTE(review): `configuration` is not referenced again in the visible code —
# confirm whether it is still needed.
configuration = BertConfig()  

In [None]:
# Save the slow pretrained tokenizer so that its vocabulary file exists on disk
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
save_path = "bert_base_uncased/"
# exist_ok=True replaces the exists()-then-makedirs() pair, which could fail if
# the directory appeared between the check and the creation.
os.makedirs(save_path, exist_ok=True)
bert_tokenizer.save_pretrained(save_path)

# Load the fast tokenizer from the saved vocabulary file; the path is derived
# from save_path so the two cannot drift apart.
tokenizer = BertWordPieceTokenizer(os.path.join(save_path, "vocab.txt"), lowercase=True)


In [None]:
# Fetch the SQuAD v1.1 training and dev JSON files; each call yields the local
# file path that is opened further down.
train_path = keras.utils.get_file(
    fname="train.json",
    origin="https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json",
)
val_path = keras.utils.get_file(
    fname="eval.json",
    origin="https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json",
)

In [None]:
class SquadExample:
    """One question/answer pair together with the model inputs derived from it.

    Construct the object, then call ``preprocess()``. If the example cannot be
    used (the answer span runs past the context, no context token overlaps the
    answer, or the encoded sequence is longer than ``max_length``), ``skip`` is
    set to ``True`` and the derived attributes stay ``None``.

    ``preprocess()`` relies on the module-level ``tokenizer`` (its ``encode()``
    result must expose ``ids`` and ``offsets``) and on ``max_length``.
    """

    def __init__(self, question, context, start_char_index, answer_text, all_answer):
        self.question = question
        self.context = context
        self.start_char_index = start_char_index
        self.answer_text = answer_text
        self.all_answer = all_answer
        self.skip = False
        # Filled in by preprocess(); they remain None for skipped examples so
        # that reading them never raises AttributeError.
        self.input_ids = None
        self.token_type_ids = None
        self.attention_mask = None
        self.start_token_index = None
        self.end_token_index = None
        self.context_token_to_char = None

    def preprocess(self):
        """Tokenize the example and compute padded inputs and answer token span.

        Sets ``skip`` to ``True`` and returns early when the example is unusable.
        """
        # Cleaning context, answer and question: collapse whitespace runs
        context = " ".join(str(self.context).split())
        question = " ".join(str(self.question).split())
        answer = " ".join(str(self.answer_text).split())

        # NOTE(review): start_char_index is applied to the whitespace-normalized
        # context; if it was measured on the raw context (presumably it was),
        # contexts containing repeated whitespace will be misaligned — confirm.
        start_char_index = self.start_char_index

        # Finding the (exclusive) end character index of the answer in context.
        # An answer that ends exactly at the end of the context has
        # end_char_index == len(context) and is still valid, so only a strictly
        # larger value is out of range.
        end_char_index = start_char_index + len(answer)
        if end_char_index > len(context):
            self.skip = True
            return

        # Marking the character indexes in context that belong to the answer
        is_char_in_ans = [0] * len(context)
        for i in range(start_char_index, end_char_index):
            is_char_in_ans[i] = 1

        # Tokenizing context
        tokenized_context = tokenizer.encode(context)

        # Finding tokens whose character span overlaps the answer characters
        ans_token_index = [
            i
            for i, (start, end) in enumerate(tokenized_context.offsets)
            if sum(is_char_in_ans[start:end]) > 0
        ]
        if not ans_token_index:
            self.skip = True
            return

        # Finding start and end token index for tokens from answer
        start_token_index = ans_token_index[0]
        end_token_index = ans_token_index[-1]

        # Tokenize question
        tokenized_question = tokenizer.encode(question)

        # Create inputs: context ids followed by the question ids without their
        # first id (presumably the leading special token — confirm).
        question_ids = tokenized_question.ids[1:]
        input_ids = tokenized_context.ids + question_ids
        token_type_ids = [0] * len(tokenized_context.ids) + [1] * len(question_ids)
        attention_mask = [1] * len(input_ids)

        # Pad with zeros up to max_length and mask the padding.
        # Skip if truncation would be needed.
        padding_length = max_length - len(input_ids)
        if padding_length < 0:  # skip
            self.skip = True
            return
        if padding_length > 0:  # pad
            input_ids = input_ids + ([0] * padding_length)
            attention_mask = attention_mask + ([0] * padding_length)
            token_type_ids = token_type_ids + ([0] * padding_length)

        self.input_ids = input_ids
        self.token_type_ids = token_type_ids
        self.attention_mask = attention_mask
        self.start_token_index = start_token_index
        self.end_token_index = end_token_index
        # Per-token (start, end) character offsets into the normalized context
        self.context_token_to_char = tokenized_context.offsets


# Read both dataset files explicitly as UTF-8 (the encoding JSON text uses) so
# that parsing does not depend on the platform's default locale encoding.
with open(train_path, encoding="utf-8") as f:
    raw_train_data = json.load(f)

with open(val_path, encoding="utf-8") as f:
    raw_val_data = json.load(f)


def create_squad_example(raw_data):
    """Build one preprocessed ``SquadExample`` per question in *raw_data*.

    Walks ``raw_data["data"]`` -> ``"paragraphs"`` -> ``"qas"``. The first entry
    of each question's ``"answers"`` supplies the answer text and start offset;
    the texts of all entries are kept as the accepted answers. Every example is
    returned, including those whose ``preprocess()`` marked them as skipped.
    """
    examples = []
    for article in raw_data["data"]:
        for paragraph in article["paragraphs"]:
            passage = paragraph["context"]
            for qa in paragraph["qas"]:
                query = qa["question"]
                answers = qa["answers"]
                primary = answers[0]
                primary_text = primary["text"]
                accepted_texts = [candidate["text"] for candidate in answers]
                example = SquadExample(
                    query,
                    passage,
                    primary["answer_start"],
                    primary_text,
                    accepted_texts,
                )
                example.preprocess()
                examples.append(example)
    return examples


def create_inputs_target(squad_examples):
    """Stack the non-skipped examples into model inputs and targets.

    Args:
        squad_examples: iterable of objects exposing ``skip`` plus the
            attributes ``input_ids``, ``token_type_ids``, ``attention_mask``,
            ``start_token_index`` and ``end_token_index``.

    Returns:
        ``(x, y)`` where ``x`` is the list
        ``[input_ids, token_type_ids, attention_mask]`` and ``y`` is the list
        ``[start_token_index, end_token_index]``; every element is a NumPy array
        with one entry per non-skipped example, in input order.
    """
    keys = (
        "input_ids",
        "token_type_ids",
        "attention_mask",
        "start_token_index",
        "end_token_index",
    )
    # Examples flagged as skipped carry no usable inputs and are left out.
    usable = [item for item in squad_examples if not item.skip]
    arrays = {key: np.array([getattr(item, key) for item in usable]) for key in keys}

    x = [arrays["input_ids"], arrays["token_type_ids"], arrays["attention_mask"]]
    y = [arrays["start_token_index"], arrays["end_token_index"]]
    return x, y


# Build and preprocess the training examples, then stack the usable ones into
# the arrays passed to the model.
train_squad_example = create_squad_example(raw_train_data)
x_train, y_train = create_inputs_target(train_squad_example)
# The printed count covers every example, including those flagged skip=True
# that create_inputs_target leaves out of x_train / y_train.
print("{} training points created.".format(len(train_squad_example)))

# Same steps for the validation split; val_squad_example is also read later by
# the ExactMatch callback.
val_squad_example = create_squad_example(raw_val_data)
x_val, y_val = create_inputs_target(val_squad_example)
print("{} evaluation points created.".format(len(val_squad_example)))


87599 training points created.
10570 evaluation points created.


In [None]:
def create_qa_model():
    """Build and compile the BERT-based question-answering model.

    The model takes three int32 inputs of length ``max_length`` (input ids,
    token type ids, attention mask) and returns two softmax distributions over
    the sequence positions: one for the answer start, one for the answer end.
    Both outputs are trained with sparse categorical cross-entropy.

    Returns:
        The compiled ``keras.Model``.
    """
    # BERT encoder
    encoder = TFBertModel.from_pretrained("bert-base-uncased")

    # Question-answer model inputs
    input_id = layers.Input(shape=(max_length,), dtype=tf.int32)
    token_type_id = layers.Input(shape=(max_length,), dtype=tf.int32)
    attention_mask = layers.Input(shape=(max_length,), dtype=tf.int32)
    # First element of the encoder output (presumably the per-token hidden
    # states — confirm against the transformers version in use).
    embedding = encoder(input_id, token_type_ids=token_type_id, attention_mask=attention_mask)[0]

    # A bias-free Dense(1) head per boundary, flattened to one score per position
    start_logit = layers.Dense(1, name="start_logit", use_bias=False)(embedding)
    start_logit = layers.Flatten()(start_logit)

    end_logit = layers.Dense(1, name="end_logit", use_bias=False)(embedding)
    end_logit = layers.Flatten()(end_logit)

    start_prob = layers.Activation(keras.activations.softmax)(start_logit)
    end_prob = layers.Activation(keras.activations.softmax)(end_logit)

    model = keras.Model(
        inputs=[input_id, token_type_id, attention_mask],
        outputs=[start_prob, end_prob],
    )
    # The outputs are already probabilities, hence from_logits=False.
    loss = keras.losses.SparseCategoricalCrossentropy(from_logits=False)
    # `learning_rate` is the supported argument name; the old `lr` alias is
    # deprecated and rejected by newer Keras releases.
    optimizer = keras.optimizers.Adam(learning_rate=5e-5)
    model.compile(optimizer=optimizer, loss=[loss, loss])
    return model



In [None]:
# Set use_tpu to False to build the model on the default device instead.
use_tpu = True
if use_tpu:
    # Create distribution strategy
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)

    # Create model inside the strategy scope
    with strategy.scope():
        model = create_qa_model()
else:
    model = create_qa_model()

# The model is bound to `model` in both branches; `bert_model` was never
# defined and raised NameError here.
model.summary()






INFO:tensorflow:Initializing the TPU system: grpc://10.4.170.122:8470


INFO:tensorflow:Initializing the TPU system: grpc://10.4.170.122:8470


INFO:tensorflow:Clearing out eager caches


INFO:tensorflow:Clearing out eager caches


INFO:tensorflow:Finished initializing TPU system.


INFO:tensorflow:Finished initializing TPU system.


INFO:tensorflow:Found TPU system:


INFO:tensorflow:Found TPU system:


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)
Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 384)]        0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, 384)]        0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 384)]        0                                            
__________________________________________________________________________________________________
tf_bert_model_1 (TFBertModel)   TFBaseModelOutputWit 109482240   input_1[0][0]                    
                                                                 input_3[0][0]         

In [None]:

def text_normalization(txt):
    """Normalize an answer string for exact-match comparison.

    Lower-cases the text, deletes every ``string.punctuation`` character,
    replaces the standalone words "a", "an" and "the" with a space, and
    collapses whitespace runs into single spaces.
    """
    lowered = txt.lower()
    # Strip punctuation in one pass
    punctuation_free = lowered.translate(str.maketrans("", "", string.punctuation))
    # Blank out articles (whole words only)
    article_free = re.sub(r"\b(a|an|the)\b", " ", punctuation_free, flags=re.UNICODE)
    # Squeeze the remaining whitespace
    return " ".join(article_free.split())


class ExactMatch(keras.callbacks.Callback):
    """Print the exact-match score on the validation data after each epoch.

    For every validation example the most likely start and end token positions
    are mapped back to a character span of the context; the prediction counts
    as a match when its normalized text equals any normalized accepted answer.

    The examples are taken from the module-level ``val_squad_example`` list
    (non-skipped entries, in order), so ``x_val`` must have been built from it.

    Args:
        x_val: model inputs for the validation examples.
        y_val: validation targets; only ``len(y_val[0])`` is used, as the
            denominator of the score.
    """

    def __init__(self, x_val, y_val):
        # Let the Keras base class set up its own state first.
        super().__init__()
        self.x_val = x_val
        self.y_val = y_val

    def on_epoch_end(self, epoch, logs=None):
        pred_start, pred_end = self.model.predict(self.x_val)
        count = 0
        val_examples_no_skip = [eg for eg in val_squad_example if not eg.skip]
        for index, (start, end) in enumerate(zip(pred_start, pred_end)):
            squad_eg = val_examples_no_skip[index]
            offset = squad_eg.context_token_to_char
            start = np.argmax(start)
            end = np.argmax(end)
            # A start position beyond the context tokens cannot be mapped to text.
            if start >= len(offset):
                continue
            # The token offsets were computed on the whitespace-normalized
            # context, so the same normalization is applied before slicing.
            context = " ".join(str(squad_eg.context).split())
            pred_char_start = offset[start][0]
            if end < len(offset):
                pred_char_end = offset[end][1]
                pred_ans = context[pred_char_start:pred_char_end]
            else:
                # End position falls outside the context tokens: take the rest.
                pred_ans = context[pred_char_start:]

            normalized_pred_ans = text_normalization(pred_ans)
            normalized_true_ans = [text_normalization(ans) for ans in squad_eg.all_answer]
            if normalized_pred_ans in normalized_true_ans:
                count += 1
        total = len(self.y_val[0])
        # Guard against an empty validation set instead of dividing by zero.
        acc = count / total if total else 0.0
        print(" ")
        print(f"\nepoch={epoch+1}, exact match score={acc:.2f}")



In [None]:
# Train for two epochs; the callback prints the exact-match score on the
# validation data at the end of each one.
exact_match_callback = ExactMatch(x_val, y_val)
model.fit(x_train, y_train, batch_size=64, epochs=2, verbose=2, callbacks=[exact_match_callback])


Epoch 1/2




 

epoch=1, exact match score=0.77
1346/1346 - 307s - loss: 0.5741 - activation_6_loss: 0.3117 - activation_7_loss: 0.2624
Epoch 2/2
 

epoch=2, exact match score=0.76
1346/1346 - 307s - loss: 0.4365 - activation_6_loss: 0.2343 - activation_7_loss: 0.2021


<tensorflow.python.keras.callbacks.History at 0x7f7c7ca70198>