In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import keras
import matplotlib.pyplot as plt
import seaborn as sns
import json

### **Load SQuAD dataset**

In [2]:
def load_dataset():
  """Fetch the SQuAD v1.1 train and dev JSON files and return them parsed.

  Returns:
    (raw_train_data, raw_eval_data): the decoded JSON documents of the
    training split and the dev split, in that order.
  """
  sources = [
      ("train.json", "https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json"),
      ("eval.json", "https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json"),
  ]
  # resolve both local paths first, then parse the files in the same order
  local_paths = [tf.keras.utils.get_file(fname, origin) for fname, origin in sources]
  parsed = []
  for local_path in local_paths:
    with open(local_path) as handle:
      parsed.append(json.load(handle))
  raw_train_data, raw_eval_data = parsed
  return raw_train_data,raw_eval_data

In [3]:
# Fetch and parse both SQuAD v1.1 splits (train and dev) into Python objects.
raw_train_data, raw_eval_data = load_dataset()

Downloading data from https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json
Downloading data from https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json


### **Load BERT model**

**Load the BERT layer from TensorFlow Hub and obtain its pooled output, sequence output, and vocabulary file for the question-answering task.**

In [4]:
import tensorflow_hub as hub

def load_BERT(url,max_seq_length,trainable=True):
  """Create the three BERT input tensors and run them through a TF-Hub BERT layer.

  Args:
    url: TF-Hub handle of the BERT model to load.
    max_seq_length: fixed length of every input sequence.
    trainable: whether the BERT weights may be updated during fine-tuning.

  Returns:
    Tuple ``(vocab_file, bert_layer, pooled_output, sequence_output,
    input_word_ids, input_mask, segment_ids)``.
  """
  def make_input(name):
    # every BERT input is an int32 tensor of shape (batch, max_seq_length)
    return tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name=name)

  input_word_ids = make_input('input_word_ids')
  input_mask = make_input('input_mask')
  segment_ids = make_input('segment_ids')

  # honour the caller's choice instead of always forcing trainable=True
  bert_layer = hub.KerasLayer(url, trainable=trainable)

  # pooled output has shape (batch size, embedding dim) which is an embedding of the [CLS] token and represents entire sequence
  # sequence output has shape (batch size, max sequence length, embedding dim) which has representation for each token

  pooled_output, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
  # path of the vocabulary file bundled with the hub model, needed to build a matching tokenizer
  vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy().decode("utf-8")

  return vocab_file,bert_layer,pooled_output,sequence_output,input_word_ids, input_mask, segment_ids

In [5]:
# TF-Hub handle of the uncased English BERT model used as the encoder.
url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2"
# Fixed token length of every model input; Sample.preprocess() skips pairs that do not fit.
max_seq_length = 399

# The returned layer and tensors stay as notebook-level globals; get_FineTunedBERT() reads them directly.
vocab_file,bert_layer,pooled_output,sequence_output,input_word_ids, input_mask, segment_ids = load_BERT(url,max_seq_length,True)

### **Creating the fine-tuned BERT model**

In [6]:
from keras import layers

In [7]:
class ModelParams:
  """Bundle of the loss, the Adam optimizer and the hyper-parameters used to build it."""

  def __init__(self,learning_rate=1e-5, beta_1=0.9, beta_2=0.98, epsilon=1e-9):
    # the loss is fed probabilities (from_logits=False) together with integer class targets
    self.loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)

    adam_settings = dict(
        learning_rate=learning_rate,
        beta_1=beta_1,
        beta_2=beta_2,
        epsilon=epsilon,
    )
    self.optimizer = tf.keras.optimizers.Adam(**adam_settings)

    # keep the raw values around so they can be inspected later
    self.learning_rate, self.beta_1 = learning_rate, beta_1
    self.beta_2, self.epsilon = beta_2, epsilon

In [8]:
# Default hyper-parameters; get_FineTunedBERT() reads this `params` global when compiling the model.
params = ModelParams()

In [9]:
def get_FineTunedBERT():
  """Attach start/end span heads to the BERT sequence output and compile the model.

  Reads the notebook-level ``sequence_output``, ``input_word_ids``,
  ``input_mask``, ``segment_ids`` and ``params`` globals.

  Returns:
    A compiled ``tf.keras.Model`` whose two outputs are the softmax
    distributions over token positions for the answer start and answer end.
  """
  # one bias-free score per token for the answer start, flattened to one row per example
  start_scores = layers.Dense(1, name="start_logit", use_bias=False)(sequence_output)
  start_scores = layers.Flatten()(start_scores)

  # same head for the answer end
  end_scores = layers.Dense(1, name="end_logit", use_bias=False)(sequence_output)
  end_scores = layers.Flatten()(end_scores)

  # turn the scores into probabilities (the loss is configured with from_logits=False)
  start_probs = layers.Activation(keras.activations.softmax)(start_scores)
  end_probs = layers.Activation(keras.activations.softmax)(end_scores)

  model = tf.keras.Model(
      inputs=[input_word_ids, input_mask, segment_ids],
      outputs=[start_probs, end_probs],
  )
  # the same loss object is applied to both outputs
  model.compile(optimizer=params.optimizer, loss=[params.loss, params.loss])

  return model

In [10]:
# Build and compile the span-prediction model on top of the BERT tensors created earlier.
model = get_FineTunedBERT()

In [11]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_word_ids (InputLayer)    [(None, 399)]        0           []                               
                                                                                                  
 input_mask (InputLayer)        [(None, 399)]        0           []                               
                                                                                                  
 segment_ids (InputLayer)       [(None, 399)]        0           []                               
                                                                                                  
 keras_layer (KerasLayer)       [(None, 768),        109482241   ['input_word_ids[0][0]',         
                                 (None, 399, 768)]                'input_mask[0][0]',         

### **Load BERT tokenizer**

In [12]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.20.1-py3-none-any.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 30.1 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 13.8 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 67.9 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 62.4 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling 

In [13]:
from tokenizers import BertWordPieceTokenizer

def load_tokenizer(vocab_file):
  """Create a lower-casing BERT word-piece tokenizer from the given vocabulary file.

  Args:
    vocab_file: path of the vocabulary file to load.

  Returns:
    The configured ``BertWordPieceTokenizer``.
  """
  return BertWordPieceTokenizer(vocab=vocab_file, lowercase=True)

In [14]:
# Tokenizer built from the vocabulary file of the loaded BERT layer; Sample.get_tokens() uses this global.
tokenizer = load_tokenizer(vocab_file)

### **SQuAD JSON data format**
* #### {'data' : [{'title' : '', 'paragraphs' : [{'context' : '', 'qas' : [{'answers' : [{'answer_start' : 0, 'text' : ''}], 'id' : '', 'question' : ''}, ...]}, ...]}, ...]}

In [15]:
# format:
# {'data' : [{'title' : '', 'paragraphs' : [{'context' : '', 'qas' : [{'answers' : [{'answer_start' : 0, 'text' : ''}], 'id' : '', 'question' : ''}, ...]}, ...]}, ...]}

## **Creating the training sample prototype**

In [16]:
class Sample:
  """One question/context pair together with the arrays BERT needs for it.

  Relies on the notebook-level ``tokenizer`` and ``max_seq_length`` globals.
  After ``preprocess()`` either ``skip`` is True (the sample is unusable) or
  ``input_word_ids``, ``segment_ids``, ``input_mask`` and
  ``context_token_to_char`` are populated.
  """

  def __init__(self, question, context, start_char_idx=None, answer_text=None, all_answers=None):
    """Store the raw texts and the optional answer annotation.

    Args:
      question: question text.
      context: paragraph the answer is taken from.
      start_char_idx: character offset of the answer in the context, if labeled.
      answer_text: answer string, or None for an unlabeled sample.
      all_answers: every reference answer text for the question, if labeled.
    """
    self.question = question
    self.context = context
    self.start_char_idx = start_char_idx
    self.end_char_idx = -1
    self.answer_text = answer_text
    self.skip = False
    self.start_token_idx = -1
    self.end_token_idx = -1
    self.max_seq_length = max_seq_length
    self.padding_length = 10
    self.all_answers = all_answers

  def get_tokens(self):
    """Collapse whitespace in context and question and tokenize both.

    Returns:
      ``((context, question), (tokenized_context, tokenized_question))``.
    """
    context = " ".join(str(self.context).split())
    question = " ".join(str(self.question).split())

    tokenized_context = tokenizer.encode(context)
    tokenized_question = tokenizer.encode(question)

    return (context,question),(tokenized_context,tokenized_question)

  def get_ids(self,tokenized_context,tokenized_question):
    """Concatenate context and question ids and build segment ids and mask.

    Also updates ``self.padding_length`` with the number of slots left up to
    ``self.max_seq_length`` (negative when the pair is too long).

    Returns:
      ``(input_ids, seg_ids, mask)`` as unpadded Python lists.
    """
    # the first id of the question encoding is dropped (presumably its leading [CLS] token)
    question_ids = tokenized_question.ids[1:]
    input_ids = tokenized_context.ids + question_ids
    seg_ids = [0] * len(tokenized_context.ids) + [1] * len(question_ids)
    mask = [1] * len(input_ids)
    self.padding_length = self.max_seq_length - len(input_ids)
    return (input_ids,seg_ids,mask)

  def preprocess(self):
    """Compute answer token indices (if labeled) and the padded BERT inputs.

    Sets ``self.skip`` and returns early when the answer runs past the
    context, when no context token overlaps the answer, or when the pair is
    longer than ``self.max_seq_length``.
    """
    # getting the tokenized text
    (context,question),(tokenized_context,tokenized_question) = self.get_tokens()

    if self.answer_text is not None:
      answer = " ".join(str(self.answer_text).split())

      # calculating end character index (exclusive)
      self.end_char_idx = self.start_char_idx + len(answer)
      # the end index is exclusive, so an answer finishing on the very last
      # character (end == len(context)) is still inside the context
      if self.end_char_idx > len(context):
        self.skip = True
        return

      # finding the relevant tokens present in the answer: a token belongs to
      # the answer when its character span overlaps [start_char_idx, end_char_idx)
      ans_token_idx = [
          idx
          for idx, (start, end) in enumerate(tokenized_context.offsets)
          if max(start, self.start_char_idx) < min(end, self.end_char_idx)
      ]
      if len(ans_token_idx) == 0:
        self.skip = True
        return

      self.start_token_idx = ans_token_idx[0]
      self.end_token_idx = ans_token_idx[-1]

    # getting the ids necessary for BERT input
    (input_ids,seg_ids,mask) = self.get_ids(tokenized_context,tokenized_question)

    # pairs longer than the model input cannot be used
    if self.padding_length < 0:
      self.skip = True
      return

    # adding necessary padding
    if self.padding_length > 0:
      padding = [0] * self.padding_length
      input_ids = input_ids + padding
      mask = mask + padding
      seg_ids = seg_ids + padding

    self.input_word_ids = input_ids
    self.segment_ids = seg_ids
    self.input_mask = mask
    self.context_token_to_char = tokenized_context.offsets

### **Creating the training and testing examples**

In [17]:
def create_examples(data):
    """Turn a SQuAD-style dict into a list of preprocessed ``Sample`` objects.

    Args:
        data: dict with a ``"data"`` list of articles, each holding
            ``"paragraphs"`` with a ``"context"`` and a list of ``"qas"``.

    Returns:
        One ``Sample`` per question, in file order. Questions with at least
        one answer use the first answer as the training target and keep all
        answer texts; questions with no answers (missing key or empty list)
        become unlabeled samples. Skipped samples are included; check ``skip``.
    """
    examples = []
    for item in data["data"]:
        for para in item["paragraphs"]:
            context = para["context"]
            for qas in para["qas"]:
                question = qas["question"]
                # an empty "answers" list is treated like a missing key instead of failing on [0]
                answers = qas.get("answers")
                if answers:
                    first_answer = answers[0]
                    sample = Sample(
                        question,
                        context,
                        first_answer["answer_start"],
                        first_answer["text"],
                        [answer["text"] for answer in answers],
                    )
                else:
                    sample = Sample(question, context)

                # preprocess each sample
                sample.preprocess()
                examples.append(sample)
    return examples

### **Creating the data and target pairs**

In [18]:
def create_data_target_pairs(examples):
    """Stack the per-sample fields of all non-skipped examples into numpy arrays.

    Args:
        examples: iterable of preprocessed samples exposing ``skip`` and the
            five fields listed below.

    Returns:
        ``(x, y)`` where ``x`` is ``[input_word_ids, input_mask, segment_ids]``
        and ``y`` is ``[start_token_idx, end_token_idx]``, each a numpy array
        with one row/entry per kept example.
    """
    field_names = (
        "input_word_ids",
        "segment_ids",
        "input_mask",
        "start_token_idx",
        "end_token_idx",
    )
    # only examples that survived preprocessing carry these attributes
    kept = [example for example in examples if example.skip == False]
    stacked = {
        name: np.array([getattr(example, name) for example in kept])
        for name in field_names
    }
    inputs = [stacked[name] for name in ("input_word_ids", "input_mask", "segment_ids")]
    targets = [stacked[name] for name in ("start_token_idx", "end_token_idx")]
    return inputs, targets

In [19]:
# Preprocess every training question, then stack the non-skipped samples into model inputs/targets.
train_examples = create_examples(raw_train_data)
x_train, y_train = create_data_target_pairs(train_examples)

# Same for the evaluation split; ValidationCallback reads the `test_examples` global by name.
test_examples = create_examples(raw_eval_data)
x_test, y_test = create_data_target_pairs(test_examples)

In [20]:
# Stacked shapes: inputs are (3 input arrays, samples, max_seq_length); targets are (2 index arrays, samples).
print("Shape of training examples created : ",np.array(x_train).shape," , ",np.array(y_train).shape)
# NOTE(review): this line reports the evaluation split (x_test / y_test) even though its label says "training".
print("Shape of training examples created : ",np.array(x_test).shape," , ",np.array(y_test).shape)

Shape of training examples created :  (3, 86299, 399)  ,  (2, 86299)
Shape of training examples created :  (3, 10349, 399)  ,  (2, 10349)


### **Training the fine-tuned BERT model**

**Custom callback**

In [21]:
import re,string

class ValidationCallback(keras.callbacks.Callback):
    """Keras callback that prints the exact-match score on the evaluation set after each epoch.

    A prediction counts as a match when, after normalization, the predicted
    answer text equals one of the normalized reference answers of the example.
    """

    # built once instead of on every call / every character
    _PUNCTUATION = frozenset(string.punctuation)
    _ARTICLES = re.compile(r"\b(a|an|the)\b", re.UNICODE)

    def normalize_text(self, text):
        """Lower-case ``text``, strip punctuation and articles, collapse whitespace."""
        # convert to lower case
        text = text.lower()
        # remove punctuation
        text = "".join(ch for ch in text if ch not in self._PUNCTUATION)
        # remove articles
        text = self._ARTICLES.sub(" ", text)
        # remove redundant whitespaces
        return " ".join(text.split())

    def __init__(self, x_eval, y_eval, examples=None):
        """Store the evaluation data.

        Args:
            x_eval: model inputs of the evaluation set.
            y_eval: targets of the evaluation set; ``y_eval[0]`` gives the number of examples.
            examples: preprocessed samples that ``x_eval`` was built from. When
                omitted, the notebook-level ``test_examples`` global is used.
        """
        super().__init__()
        self.x_eval = x_eval
        self.y_eval = y_eval
        self.examples = examples

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the evaluation set and print the exact-match score."""
        # per-token start/end probabilities for every evaluation example
        pred_start, pred_end = self.model.predict(self.x_eval)
        source_examples = test_examples if self.examples is None else self.examples
        # predictions only exist for samples that were not skipped during preprocessing
        valid_examples = [example for example in source_examples if not example.skip]
        count = 0
        for idx, (start_probs, end_probs) in enumerate(zip(pred_start, pred_end)):
            # take the required Sample object with the ground-truth answers in it
            example = valid_examples[idx]
            # use offsets to get back the span of text corresponding to
            # our predicted first and last tokens
            offsets = example.context_token_to_char
            start = np.argmax(start_probs)
            end = np.argmax(end_probs)
            if start >= len(offsets):
                # predicted start lies outside the context tokens: counted as a miss
                continue
            pred_char_start = offsets[start][0]
            if end < len(offsets):
                pred_ans = example.context[pred_char_start:offsets[end][1]]
            else:
                pred_ans = example.context[pred_char_start:]
            normalized_pred_ans = self.normalize_text(pred_ans)
            # clean the real answers
            normalized_true_ans = [self.normalize_text(ans) for ans in (example.all_answers or [])]
            # check if the predicted answer is in an array of the ground-truth answers
            if normalized_pred_ans in normalized_true_ans:
                count += 1
        total = len(self.y_eval[0])
        # avoid a ZeroDivisionError on an empty evaluation set
        acc = count / total if total else 0.0
        print(f"\nepoch={epoch + 1}, exact match score={acc:.2f}")

In [22]:
# Fine-tune for 2 epochs with batch size 8; the callback prints the exact-match score on the test split after each epoch.
history = model.fit(x_train, y_train, epochs=2, batch_size=8, callbacks=[ValidationCallback(x_test, y_test)])

Epoch 1/2
epoch=1, exact match score=0.77
Epoch 2/2
epoch=2, exact match score=0.79


In [23]:
# Persist the fine-tuned weights to weights.h5 in the working directory.
model.save_weights("./weights.h5")

### **Testing on unseen data**

In [59]:
# Recreate the model and restore the weights written by model.save_weights("./weights.h5").
model = get_FineTunedBERT()
model.load_weights('weights.h5')

In [60]:
def get_predicted_answers(data):
  """Run the model on a SQuAD-style dict and return the predicted answer spans.

  Args:
    data: dict in the format accepted by ``create_examples``.

  Returns:
    ``(questions, answers)``: parallel lists with the question text and the
    predicted answer text. Questions whose sample was skipped during
    preprocessing, or whose predicted start token lies outside the context
    tokens, are left out.
  """
  test_samples = create_examples(data)
  # The model only sees non-skipped samples, so predictions must be paired with
  # that filtered list; indexing the unfiltered list would shift every
  # prediction after a skipped sample onto the wrong question/context.
  valid_samples = [sample for sample in test_samples if not sample.skip]
  questions = []
  answers = []
  if not valid_samples:
    # nothing to predict on
    return questions,answers

  x_test, _ = create_data_target_pairs(valid_samples)
  pred_start, pred_end = model.predict(x_test)
  for test_sample, start_probs, end_probs in zip(valid_samples, pred_start, pred_end):
    # character span of every context token
    offsets = test_sample.context_token_to_char
    start = np.argmax(start_probs)
    end = np.argmax(end_probs)
    if start >= len(offsets):
      continue
    pred_char_start = offsets[start][0]
    if end < len(offsets):
      pred_ans = test_sample.context[pred_char_start:offsets[end][1]]
    else:
      # predicted end falls outside the context tokens: take the rest of the context
      pred_ans = test_sample.context[pred_char_start:]
    questions.append(test_sample.question)
    answers.append(pred_ans)
  return questions,answers

In [61]:
def display_question_answers(questions,answers):
  """Print every question with its answer as a "Q: ..." line followed by an "A: ..." line.

  Questions and answers are paired positionally; surplus items in the longer
  input are ignored.
  """
  qa_pairs = zip(questions, answers)
  for q_text, a_text in qa_pairs:
    print("Q: " + q_text)
    print("A: " + a_text)

In [62]:
c1 = '''Shah Rukh Khan (pronounced [ˈʃɑːɦɾʊx xɑːn]; born 2 November 1965), also known by the initialism SRK, is an Indian actor, film producer, and television personality who works in Hindi films. Referred to in the media as the "Baadshah of Bollywood" (in reference to his 1999 film Baadshah), "King of Bollywood" and "King Khan", he has appeared in more than 80 films, and earned numerous accolades, including 14 Filmfare Awards. The Government of India has awarded him the Padma Shri, and the Government of France has awarded him the Ordre des Arts et des Lettres and the Legion of Honour. Khan has a significant following in Asia and the Indian diaspora worldwide. In terms of audience size and income, he has been described as one of the most successful film stars in the world.'''

In [63]:
# Hand-written SQuAD-style input for the first paragraph. The questions carry no
# "answers" key, so create_examples() builds unlabeled samples for them.
data1 = {"data":
    [
        {"title": "Shah Rukh Khan",
         "paragraphs": [
             {
                 "context": c1,
                 "qas": [
                     {"question": "What name is Shah Rukh Khan refered to the media as?",
                      "id": "Q1"
                      },
                     {"question": "How many films did he appear in?",
                      "id": "Q2"
                      }
                 ]}]}]}

# Predict an answer span for each question and print the question/answer pairs.
questions,answers = get_predicted_answers(data1)
display_question_answers(questions,answers)

Q: What name is Shah Rukh Khan refered to the media as?
A: Baadshah of Bollywood
Q: How many films did he appear in?
A: more than 80


In [64]:
c2 = '''As of 2015, Khan is co-chairman of the motion picture production company Red Chillies Entertainment and its subsidiaries and is the co-owner of the Indian Premier League cricket team Kolkata Knight Riders and the Caribbean Premier League team Trinbago Knight Riders. He is a frequent television presenter and stage show performer. The media often label him as "Brand SRK" because of his many endorsement and entrepreneurship ventures. Khan's philanthropic endeavours have provided health care and disaster relief, and he was honoured with UNESCO's Pyramide con Marni award in 2011 for his support of children's education and the World Economic Forum's Crystal Award in 2018 for his leadership in championing women's and children's rights in India. He regularly features in listings of the most influential people in Indian culture, and in 2008, Newsweek named him one of their fifty most powerful people in the world.'''

In [65]:
# Hand-written SQuAD-style input for the second paragraph; like data1, the questions have no "answers" key.
data2 = {"data":
    [
        {"title": "Shah Rukh Khan",
         "paragraphs": [
             {
                 "context": c2,
                 "qas": [
                     {"question": "Shah Rukh Khan is the chairman of which production company?",
                      "id": "Q1"
                      },
                     {"question": "He was awarded which award by UNESCO?",
                      "id": "Q2"
                      }
                 ]}]}]}

# Predict an answer span for each question and print the question/answer pairs.
questions,answers = get_predicted_answers(data2)
display_question_answers(questions,answers)

Q: Shah Rukh Khan is the chairman of which production company?
A: Red Chillies Entertainment
Q: He was awarded which award by UNESCO?
A: Pyramide con Marni award
