In [11]:
# %tensorflow_version 2.x
import tensorflow as tf
print("Tensorflow version " + tf.__version__)

try:
  tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection
  print('Running on TPU ', tpu.cluster_spec().as_dict()['worker'])
except ValueError:
  raise BaseException('ERROR: Not connected to a TPU runtime; please see the previous cell in this notebook for instructions!')

tf.config.experimental_connect_to_cluster(tpu)
tf.tpu.experimental.initialize_tpu_system(tpu)
tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)

Tensorflow version 2.2.0
Running on TPU  ['10.0.0.2:8470']


In [12]:
!pip install tokenizer
!pip install transformers

Collecting tokenizer
  Downloading tokenizer-2.4.0-py2.py3-none-any.whl (105 kB)
[K     |████████████████████████████████| 105 kB 2.8 MB/s 
[?25hInstalling collected packages: tokenizer
Successfully installed tokenizer-2.4.0
You should consider upgrading via the '/opt/conda/bin/python3.7 -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/opt/conda/bin/python3.7 -m pip install --upgrade pip' command.[0m


In [13]:
import os
import re
import json
import string
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tokenizers import BertWordPieceTokenizer
from transformers import BertTokenizer, TFBertModel, BertConfig

max_len = 384
configuration = BertConfig() 



In [14]:
slow_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
save_path = "bert_base_uncased/"
if not os.path.exists(save_path):
    os.makedirs(save_path)
slow_tokenizer.save_pretrained(save_path)

# Load the fast tokenizer from saved file
tokenizer = BertWordPieceTokenizer("bert_base_uncased/vocab.txt", lowercase=True)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [15]:
train_data_url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json"
train_path = keras.utils.get_file("train.json", train_data_url)
eval_data_url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json"
eval_path = keras.utils.get_file("eval.json", eval_data_url)

Downloading data from https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json
Downloading data from https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json


In [16]:
class SquadExample:
    def __init__(self, question, context, start_char_idx, answer_text, all_answers):
        self.question = question
        self.context = context
        self.start_char_idx = start_char_idx
        self.answer_text = answer_text
        self.all_answers = all_answers
        self.skip = False

    def preprocess(self):
        context = self.context
        question = self.question
        answer_text = self.answer_text
        start_char_idx = self.start_char_idx

        # Clean context, answer and question
        context = " ".join(str(context).split())
        question = " ".join(str(question).split())
        answer = " ".join(str(answer_text).split())

        # Find end character index of answer in context
        end_char_idx = start_char_idx + len(answer)
        if end_char_idx >= len(context):
            self.skip = True
            return

        # Mark the character indexes in context that are in answer
        is_char_in_ans = [0] * len(context)
        for idx in range(start_char_idx, end_char_idx):
            is_char_in_ans[idx] = 1

        # Tokenize context
        tokenized_context = tokenizer.encode(context)

        # Find tokens that were created from answer characters
        ans_token_idx = []
        for idx, (start, end) in enumerate(tokenized_context.offsets):
            if sum(is_char_in_ans[start:end]) > 0:
                ans_token_idx.append(idx)

        if len(ans_token_idx) == 0:
            self.skip = True
            return

        # Find start and end token index for tokens from answer
        start_token_idx = ans_token_idx[0]
        end_token_idx = ans_token_idx[-1]

        # Tokenize question
        tokenized_question = tokenizer.encode(question)

        # Create inputs
        input_ids = tokenized_context.ids + tokenized_question.ids[1:]
        token_type_ids = [0] * len(tokenized_context.ids) + [1] * len(
            tokenized_question.ids[1:]
        )
        attention_mask = [1] * len(input_ids)

        # Pad and create attention masks.
        # Skip if truncation is needed
        padding_length = max_len - len(input_ids)
        if padding_length > 0:  # pad
            input_ids = input_ids + ([0] * padding_length)
            attention_mask = attention_mask + ([0] * padding_length)
            token_type_ids = token_type_ids + ([0] * padding_length)
        elif padding_length < 0:  # skip
            self.skip = True
            return

        self.input_ids = input_ids
        self.token_type_ids = token_type_ids
        self.attention_mask = attention_mask
        self.start_token_idx = start_token_idx
        self.end_token_idx = end_token_idx
        self.context_token_to_char = tokenized_context.offsets


with open(train_path) as f:
    raw_train_data = json.load(f)

with open(eval_path) as f:
    raw_eval_data = json.load(f)


def create_squad_examples(raw_data):
    squad_examples = []
    for item in raw_data["data"]:
        for para in item["paragraphs"]:
            context = para["context"]
            for qa in para["qas"]:
                question = qa["question"]
                answer_text = qa["answers"][0]["text"]
                all_answers = [_["text"] for _ in qa["answers"]]
                start_char_idx = qa["answers"][0]["answer_start"]
                squad_eg = SquadExample(
                    question, context, start_char_idx, answer_text, all_answers
                )
                squad_eg.preprocess()
                squad_examples.append(squad_eg)
    return squad_examples


def create_inputs_targets(squad_examples):
    dataset_dict = {
        "input_ids": [],
        "token_type_ids": [],
        "attention_mask": [],
        "start_token_idx": [],
        "end_token_idx": [],
    }
    for item in squad_examples:
        if item.skip == False:
            for key in dataset_dict:
                dataset_dict[key].append(getattr(item, key))
    for key in dataset_dict:
        dataset_dict[key] = np.array(dataset_dict[key])

    x = [
        dataset_dict["input_ids"],
        dataset_dict["token_type_ids"],
        dataset_dict["attention_mask"],
    ]
    y = [dataset_dict["start_token_idx"], dataset_dict["end_token_idx"]]
    return x, y


train_squad_examples = create_squad_examples(raw_train_data)
x_train, y_train = create_inputs_targets(train_squad_examples[:20000])
print(f"{len(train_squad_examples)} training points created.")

eval_squad_examples = create_squad_examples(raw_eval_data)
x_eval, y_eval = create_inputs_targets(eval_squad_examples)
print(f"{len(eval_squad_examples)} evaluation points created.")

print(len(x_train), len(x_eval))

87599 training points created.
10570 evaluation points created.
3 3


In [17]:
len(train_squad_examples[:50000])

50000

In [18]:
def normalize_text(text):
    text = text.lower()

    # Remove punctuations
    exclude = set(string.punctuation)
    text = "".join(ch for ch in text if ch not in exclude)

    # Remove articles
    regex = re.compile(r"\b(a|an|the)\b", re.UNICODE)
    text = re.sub(regex, " ", text)

    # Remove extra white space
    text = " ".join(text.split())
    return text


class ExactMatch(keras.callbacks.Callback):
    """
    Each `SquadExample` object contains the character level offsets for each token
    in its input paragraph. We use them to get back the span of text corresponding
    to the tokens between our predicted start and end tokens.
    All the ground-truth answers are also present in each `SquadExample` object.
    We calculate the percentage of data points where the span of text obtained
    from model predictions matches one of the ground-truth answers.
    """

    def __init__(self, x_eval, y_eval):
        self.x_eval = x_eval
        self.y_eval = y_eval

    def on_epoch_end(self, epoch, logs=None):
        pred_start, pred_end = self.model.predict(self.x_eval)
        count = 0
        eval_examples_no_skip = [_ for _ in eval_squad_examples if _.skip == False]
        for idx, (start, end) in enumerate(zip(pred_start, pred_end)):
            squad_eg = eval_examples_no_skip[idx]
            offsets = squad_eg.context_token_to_char
            start = np.argmax(start)
            end = np.argmax(end)
            if start >= len(offsets):
                continue
            pred_char_start = offsets[start][0]
            if end < len(offsets):
                pred_char_end = offsets[end][1]
                pred_ans = squad_eg.context[pred_char_start:pred_char_end]
            else:
                pred_ans = squad_eg.context[pred_char_start:]

            normalized_pred_ans = normalize_text(pred_ans)
            normalized_true_ans = [normalize_text(_) for _ in squad_eg.all_answers]
            if normalized_pred_ans in normalized_true_ans:
                count += 1
        acc = count / len(self.y_eval[0])
        print(f"\nepoch={epoch+1}, exact match score={acc:.2f}")
exact_match_callback = ExactMatch(x_eval, y_eval)

In [19]:
tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
tf.config.experimental_connect_to_cluster(tpu)
tf.tpu.experimental.initialize_tpu_system(tpu)
strategy = tf.distribute.experimental.TPUStrategy(tpu)

In [20]:
def create_model():
    ## BERT encoder
    encoder = TFBertModel.from_pretrained("bert-base-uncased")

    ## QA Model
    input_ids = layers.Input(shape=(max_len,), dtype=tf.int32)
    token_type_ids = layers.Input(shape=(max_len,), dtype=tf.int32)
    attention_mask = layers.Input(shape=(max_len,), dtype=tf.int32)
    embedding = encoder(
        input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask
    )[0]

    start_logits = layers.Dense(1, name="start_logit", use_bias=False)(embedding)
    start_logits = layers.Flatten()(start_logits)

    end_logits = layers.Dense(1, name="end_logit", use_bias=False)(embedding)
    end_logits = layers.Flatten()(end_logits)

    start_probs = layers.Activation(keras.activations.softmax)(start_logits)
    end_probs = layers.Activation(keras.activations.softmax)(end_logits)

    model = keras.Model(
        inputs=[input_ids, token_type_ids, attention_mask],
        outputs=[start_probs, end_probs],
    )
    loss = keras.losses.SparseCategoricalCrossentropy(from_logits=False)
    optimizer = keras.optimizers.Adam(lr=5e-5)
    model.compile(optimizer=optimizer, loss=[loss, loss])
    return model

In [21]:
model = create_model()

model.summary()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=536063208.0, style=ProgressStyle(descri…


Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 384)]        0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, 384)]        0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 384)]        0                                            
__________________________________________________________________________________________________
tf_bert_model (TFBertModel)     ((None, 384, 768), ( 109482240   input_1[0][0]                    
_____________________________________________________________________________________________

In [22]:
model.fit(
    x_train,
    y_train,
    epochs=3,
    verbose=1,
    batch_size=64,
    callbacks=[exact_match_callback],
)

Epoch 1/3
epoch=1, exact match score=0.69
Epoch 2/3
epoch=2, exact match score=0.70
Epoch 3/3
epoch=3, exact match score=0.71


<tensorflow.python.keras.callbacks.History at 0x7f34e7b2cad0>

In [23]:
class QnATestData():
    
    def __init__(self):
      self.input_ids = []
      self.token_type_ids = []
      self.attention_masks = []
      self.context_token_to_char = []
        
    def preprocess(self, context, questions):
      input_id = []

      for each_question in questions:

        # Clean context, answer and question
        context = " ".join(str(context).split())
        question = " ".join(str(each_question).split())

        # Tokenize context and question
        tokenized_context = tokenizer.encode(context)
        tokenized_question = tokenizer.encode(each_question)

        # Create inputs
        input_id = tokenized_context.ids + tokenized_question.ids[1:]
        token_type_id = [0] * len(tokenized_context.ids) + [1] * len(tokenized_question.ids[1:])
        attention_mask = [1] * len(input_id)

        # Pad and create attention masks.
        # Skip if truncation is needed
        padding_length = max_len - len(input_id)
        
        if padding_length > 0:  # pad
            input_id = input_id + ([0] * padding_length)
            attention_mask = attention_mask + ([0] * padding_length)
            token_type_id = token_type_id + ([0] * padding_length)
        elif padding_length < 0:  # skip
            self.skip = True
            continue
        

        self.input_ids.append(input_id)
        self.token_type_ids.append(token_type_id)
        self.attention_masks.append(attention_mask)
        self.context_token_to_char.append(tokenized_context.offsets)

    def get_test_result(self, context, questions):
      pred_answer_list = []
      self.preprocess(context, questions)
      x = [
        np.array(self.input_ids),
        np.array(self.token_type_ids),
        np.array(self.attention_masks),
      ]

      pred_start, pred_end = model.predict(x)
      for idx, (start, end) in enumerate(zip(pred_start, pred_end)):
        offsets = self.context_token_to_char[idx]
        start = np.argmax(start)
        end = np.argmax(end)
        if start >= len(offsets):
          print("start is greater the offsets")
          continue
        pred_char_start = offsets[start][0]


        if end < len(offsets):
            pred_char_end = offsets[end][1]
            pred_ans = context[pred_char_start:pred_char_end]
        else:
            pred_ans = context[idx][pred_char_start:]
        pred_answer_list.append(pred_ans)
      return pred_answer_list

In [24]:

context =[    
    '''Mike and Morris lived in the same village. While Morris owned the largest jewelry shop in the village, Mike was a poor farmer. Both had large families with many sons, daughters-in-law and grandchildren. One fine day, Mike, tired of not being able to feed his family, decided to leave the village and move to the city where he was certain to earn enough to feed everyone. Along with his family, he left the village for the city. At night, they stopped under a large tree. There was a stream running nearby where they could freshen up themselves. He told his sons to clear the area below the tree, he told his wife to fetch water and he instructed his daughters-in-law to make up the fire and started cutting wood from the tree himself. They didn’t know that in the branches of the tree, there was a thief hiding. He watched as Mike’s family worked together and also noticed that they had nothing to cook. Mike’s wife also thought the same and asked her husband ” Everything is ready but what shall we eat?”. Mike raised his hands to heaven and said ” Don’t worry. He is watching all of this from above. He will help us.” The thief got worried as he had seen that the family was large and worked well together. Taking advantage of the fact that they did not know he was hiding in the branches, he decided to make a quick escape. He climbed down safely when they were not looking and ran for his life. But, he left behind the bundle of stolen jewels and money which dropped into Mike’s lap. Mike opened it and jumped with joy when he saw the contents. The family gathered all their belongings and returned to the village. There was great excitement when they told everyone how they got rich.''',
    '''The culture of nuclear families is in fashion. Parents are often heard complaining about the difficulties in bringing up children these days. Too much of freedom in demand, too much independence; over night parties; excessive extravagance, splurging pocket money; no time for studies and family all this is a common cry of such families. Aren’t parents, themselves, responsible for this pitiful state ? The basic need of a growing youth is the family, love, attention and bonding along with moral values. One should not forget that ‘charity begins at home’.
Independence and individuality both need to be respected, in order to maintain the sanctity of family. Children, today are to be handled with tact in order to bridge the ever widening generation gap. Only the reasonable demands need to be fulfilled, as there are too many expenses to be met and top many social obligations to be taken care of by the parents. Our forefathers lived happily in joint families. Children loved to live with their cousins, learnt to adjust within means. There was perfect harmony between the generations. There never existed the concept of old-age homes. There was deep respect for the family elders and love, care and concern for the youngsters. Even the minor family differences were solved amicably.
 ''',
    '''Time is very valuable. If it is passed, it cannot be brought back even after spending lakhs and crores of rupees. Whoever has valued time in this world, he has lived life with happiness and him who wasted time, he himself is wasted. Ask the time the player who missed the medal by the hundredth of a second. The train standing at the station is missed by a minute. Nowadays, many schools are not even allowed to enter school if they come late. Students should understand the value of time even more because by appreciating this life, they can achieve their life goals. ''',
    '''The increasing population has given rise to many kinds of problems – bread, cloth, housing shortage, unemployment, illiteracy, reduction in the output of agriculture and industries, etc. As we progress or grow, the population increases in proportion to it. Our development is very less in front of the growing population and development work is not seen.
All government efforts appear unsuccessful in the face of a growing population. Agricultural production and industrial development are proving to be negligible in the face of a growing population. Keeping all these things in mind, there is an urgent need to control population growth. Without it, all the efforts made for development would be incomplete. '''
]

questions1 =[
    
    ["What Morris have?","What did Mike do for a living?","Why thief got upset?","How did the fellow villagers react the Mike getting rich overnight?","Who is Mike","Mike and Morris are they brothers?"],
    ["What is the benifit of joint family?","What is nuclear family?","Describe the atmosphere in joint families.","Why nuclear family is in trend?"],
    ["What is the valuable thing?","What will be the life of those who give importance to time?","Which person is himself ruined?","Every moment of time is precious. Which example is presented in the passage for this statement?","What inspiration do we get from this passage?"],
    ["What are the problem with growing population?","Why do we not see development work?","Which efforts seem unsuccessful in front of a growing population?","Which has decreased due to the increasing population?"]
] 

for i in range(len(context)):
    qna_test_obj = QnATestData()
    print(qna_test_obj.get_test_result(context[i], questions1[i]))

['stolen jewels and money', 'farmer', '', '', '', 'Mike and Morris lived in the same village. While Morris owned the largest jewelry shop in the village, Mike was a poor farmer.']
['Children loved to live with their cousins', 'culture', '', 'The culture of nuclear families is in fashion']
['Time', 'Whoever has valued time in this world, he has lived life with happiness and him who wasted time, he himself is wasted. Ask the time the player who missed the medal by the hundredth of a second. The train standing at the station is missed by a minute. Nowadays, many schools are not even allowed to enter school if they come late. Students should understand the value of time even more because by appreciating this life, they can achieve their life goals', 'Whoever has valued time in this world, he has lived life with happiness and him who wasted time, he himself is wasted. Ask the time the player', 'Time is very valuable', 'Time is very valuable. If it is passed, it cannot be brought back even a