# BERT - example

This example is adapted from the Keras tutorial "Text Extraction with BERT": https://keras.io/examples/nlp/text_extraction_with_bert/

In [1]:
import os
import re
import json
import string
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tokenizers import BertWordPieceTokenizer
from transformers import BertTokenizer, TFBertModel, BertConfig
from tensorflow.python.client import device_lib

# Collect the name of every local device TensorFlow reports, then show them
# on a single comma-separated line under a heading.
devices = [device.name for device in device_lib.list_local_devices()]
print("Local devices visible to tensorflow:", ', '.join(devices), sep="\n")

Local devices visible to tensorflow:
/device:CPU:0, /device:XLA_CPU:0, /device:XLA_GPU:0, /device:GPU:0


In [3]:
# Default parameters and configuration for BERT.
configuration = BertConfig()
# Fixed token length of every model input: during preprocessing shorter
# examples are zero-padded up to this length and longer ones are skipped.
max_len = 254

## Set up the BERT tokenizer

In [4]:
# Fetch the pretrained tokenizer and write its files to a local directory so
# that the vocabulary file can be handed to the fast tokenizer below.
slow_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
save_path = 'bert_base_uncased/'
# exist_ok=True creates the directory only when it is missing, avoiding the
# separate exists() check that could race with another process.
os.makedirs(save_path, exist_ok=True)
slow_tokenizer.save_pretrained(save_path)

# load the fast tokenizer from saved file
tokenizer = BertWordPieceTokenizer(f'{save_path}vocab.txt', lowercase=True)

## Load the data

In [5]:
# Locations of the SQuAD v1.1 training and development splits.
train_data_url = 'https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json'
eval_data_url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json"

# Fetch each file and keep the local path it was stored at.
train_path = keras.utils.get_file("train.json", train_data_url)
eval_path = keras.utils.get_file("eval.json", eval_data_url)

## Process the data

In [6]:
class SquadExample:
    """One SQuAD question/answer pair plus the model inputs derived from it.

    After ``preprocess()`` has run, either ``skip`` is True (the example cannot
    be used) or the attributes ``input_ids``, ``token_type_ids``,
    ``attention_mask``, ``start_token_idx``, ``end_token_idx`` and
    ``context_token_to_char`` are populated.

    Relies on the module-level ``tokenizer`` and ``max_len``.
    """

    def __init__(self, question, context, start_char_idx, answer_text, all_answers):
        self.question = question
        self.context = context
        self.start_char_idx = start_char_idx
        self.answer_text = answer_text
        self.all_answers = all_answers
        # Set to True by preprocess() when the example cannot be turned into inputs.
        self.skip = False

    def preprocess(self):
        """Tokenize the context and question and build padded model inputs.

        Sets ``self.skip = True`` and returns early when the answer span
        overruns the context, when no context token overlaps the answer, or
        when the combined sequence is longer than ``max_len``.
        """
        # Clean context, answer and question
        context = self.context.strip()
        question = self.question.strip()
        answer = self.answer_text.strip()
        start_char_idx = self.start_char_idx

        # End (exclusive) character index of the answer in the context
        end_char_idx = start_char_idx + len(answer)
        # The span [start, end) fits whenever end <= len(context), so an answer
        # that ends on the very last character is still valid; skip only when
        # the span runs past the end of the context.
        if end_char_idx > len(context):
            self.skip = True
            return

        # Mark the character indexes in context that are in answer
        is_char_in_ans = [0] * len(context)
        for idx in range(start_char_idx, end_char_idx):
            is_char_in_ans[idx] = 1

        # tokenize the context
        tokenized_context = tokenizer.encode(context)

        # Find tokens that were created from answer characters
        ans_token_idx = [
            idx
            for idx, (start, end) in enumerate(tokenized_context.offsets)
            if any(is_char_in_ans[start:end])
        ]

        if len(ans_token_idx) == 0:
            self.skip = True
            return

        # Find start and end tokens index for tokens from answer
        start_token_idx = ans_token_idx[0]
        end_token_idx = ans_token_idx[-1]

        # tokenize the question
        tokenized_question = tokenizer.encode(question)

        # Create the inputs: all context tokens followed by the question tokens
        # minus the question's first token.
        question_ids = tokenized_question.ids[1:]
        input_ids = tokenized_context.ids + question_ids
        token_type_ids = [0] * len(tokenized_context.ids) + [1] * len(question_ids)
        attention_mask = [1] * len(input_ids)

        # Pad and create attention masks.
        # Skip if truncation is needed
        padding_length = max_len - len(input_ids)
        if padding_length > 0:
            input_ids = input_ids + ([0] * padding_length)
            attention_mask = attention_mask + ([0] * padding_length)
            token_type_ids = token_type_ids + ([0] * padding_length)
        elif padding_length < 0:
            self.skip = True
            return

        self.input_ids = input_ids
        self.token_type_ids = token_type_ids
        self.attention_mask = attention_mask
        self.start_token_idx = start_token_idx
        self.end_token_idx = end_token_idx
        # Per-token (start, end) character offsets into the stripped context.
        self.context_token_to_char = tokenized_context.offsets
        

In [7]:
# JSON text exchanged between systems is UTF-8 (RFC 8259), so decode it
# explicitly instead of relying on the platform's default encoding.
with open(train_path, encoding='utf-8') as f:
    raw_train_data = json.load(f)

with open(eval_path, encoding='utf-8') as f:
    raw_eval_data = json.load(f)
    
def create_squad_example(raw_data):
    """Build a preprocessed ``SquadExample`` for every question in ``raw_data``.

    Only the first listed answer of each question supplies the answer text and
    its start offset; the texts of all listed answers are kept alongside it.
    Every example is returned, including those whose ``skip`` flag was set
    during preprocessing.
    """
    squad_examples = []
    paragraphs = (
        para for article in raw_data['data'] for para in article['paragraphs']
    )
    for para in paragraphs:
        context = para['context']
        for qa in para['qas']:
            answers = qa['answers']
            first_answer = answers[0]
            example = SquadExample(
                qa['question'],
                context,
                first_answer['answer_start'],
                first_answer['text'],
                [answer['text'] for answer in answers],
            )
            example.preprocess()
            squad_examples.append(example)
    return squad_examples

def create_input_targets(squad_examples):
    """Stack the per-example features of all usable examples into arrays.

    Examples whose ``skip`` flag is set are left out.

    Returns:
        A pair ``(x, y)`` where ``x`` is the list
        ``[input_ids, token_type_ids, attention_mask]`` and ``y`` is the list
        ``[start_token_idx, end_token_idx]``, each entry a NumPy array with one
        row/element per retained example.
    """
    keys = (
        'input_ids',
        'token_type_ids',
        'attention_mask',
        'start_token_idx',
        'end_token_idx',
    )
    dataset_dict = {key: [] for key in keys}
    for item in squad_examples:
        if not item.skip:
            for key in keys:
                dataset_dict[key].append(getattr(item, key))
    for key in keys:
        dataset_dict[key] = np.array(dataset_dict[key])

    x = [
        dataset_dict['input_ids'],
        dataset_dict['token_type_ids'],
        dataset_dict['attention_mask']
    ]
    y = [
        dataset_dict['start_token_idx'],
        dataset_dict['end_token_idx']
    ]
    return x, y

In [8]:
# Turn the raw SQuAD dictionaries into preprocessed examples for both splits.
train_squad_examples = create_squad_example(raw_train_data)
eval_squad_examples = create_squad_example(raw_eval_data)

# Stack the usable examples into model inputs (x) and targets (y).
x_train, y_train = create_input_targets(train_squad_examples)
x_eval, y_eval = create_input_targets(eval_squad_examples)

# NOTE: these counts cover every example, including the ones flagged
# skip=True that create_input_targets leaves out of the arrays.
print(f"{len(train_squad_examples)} training points created")
print(f"{len(eval_squad_examples)} evaluation points created")

87599 training points created
10570 evaluation points created


### Create the Question-Answering Model using BERT and functional API

In [9]:
def create_model():
    """Build and compile the BERT-based answer-span model.

    The model takes three int32 inputs of length ``max_len`` (token ids,
    token type ids and attention mask) and returns two softmax distributions
    over the ``max_len`` token positions: one for the answer start and one
    for the answer end. It is compiled with sparse categorical cross-entropy
    on both outputs and the Adam optimizer.

    Returns:
        The compiled ``keras.Model``.
    """
    encoder = TFBertModel.from_pretrained('bert-base-uncased')

    ## QA Model
    input_ids = layers.Input(shape=(max_len,), dtype=tf.int32)
    token_type_ids = layers.Input(shape=(max_len,), dtype=tf.int32)
    attention_mask = layers.Input(shape=(max_len,), dtype=tf.int32)
    # [0] selects the encoder's first output, which the heads below consume
    # one token position at a time.
    embedding = encoder(
        input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask
    )[0]

    # One bias-free score per token position, flattened to (batch, max_len).
    start_logits = layers.Dense(1, name='start_logit', use_bias=False)(embedding)
    start_logits = layers.Flatten()(start_logits)

    end_logits = layers.Dense(1, name='end_logits', use_bias=False)(embedding)
    end_logits = layers.Flatten()(end_logits)

    start_props = layers.Activation(keras.activations.softmax)(start_logits)
    end_props = layers.Activation(keras.activations.softmax)(end_logits)

    model = keras.Model(
        inputs = [input_ids, token_type_ids, attention_mask],
        outputs = [start_props, end_props]
    )

    # The outputs are already probabilities, hence from_logits=False.
    loss = keras.losses.SparseCategoricalCrossentropy(from_logits=False)
    # `learning_rate` is the supported argument name; `lr` is a deprecated alias.
    optimizer = keras.optimizers.Adam(learning_rate=5e-5)
    model.compile(optimizer=optimizer, loss=[loss, loss])
    return model

In [10]:
use_tpu = False
if not use_tpu:
    # Default path: build the model without any distribution strategy.
    model = create_model()
else:
    # Connect to the TPU cluster and initialise it before creating the strategy.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)

    # Build the model inside the strategy scope so it is created under the
    # TPU strategy.
    with strategy.scope():
        model = create_model()

model.summary()


Some weights of the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 254)]        0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, 254)]        0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 254)]        0                                            
__________________________________________________________________________________________________
tf_bert_model (TFBertModel)     ((None, 254, 768), ( 109482240   input_1[0][0]                    
______________________________________________________________________________________________

### Create evaluation Callback

In [11]:
def normalize_text(text):
    """Canonicalise an answer string for exact-match comparison.

    Lower-cases the text, strips the characters in ``string.punctuation``,
    drops the standalone articles "a", "an" and "the", and collapses runs of
    whitespace into single spaces.
    """
    lowered = text.lower()

    # Delete every punctuation character in one pass.
    without_punct = lowered.translate(str.maketrans("", "", string.punctuation))

    # Replace whole-word articles with a space; the extra spaces are
    # squeezed out by the split/join below.
    without_articles = re.sub(r"\b(a|an|the)\b", " ", without_punct, flags=re.UNICODE)

    return " ".join(without_articles.split())

class ExactMatch(keras.callbacks.Callback):
    """
    Each 'SquadExample' object contains the character level offsets for each token in its input
    paragraph. We use them to get back the span of the text corresponding to the tokens between
    our predicted start and end tokens.
    All the ground-truth answers are also present in each 'SquadExample' object. We calculate the
    percentage of data points where the span of text obtained from model predictions matches one
    of the ground-truth answers.

    Reads the module-level ``eval_squad_examples`` to recover the examples that
    correspond to the rows of ``x_eval``.
    """
    def __init__(self, x_eval, y_eval):
        # Let the base Callback initialise its own attributes before adding ours.
        super().__init__()
        self.x_eval = x_eval
        self.y_eval = y_eval

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the evaluation inputs and print the exact-match score."""
        pred_start, pred_end = self.model.predict(self.x_eval)
        count = 0
        # Rows of x_eval were built only from non-skipped examples, so filter
        # the same way to keep predictions and examples aligned by index.
        eval_examples_no_skip = [eg for eg in eval_squad_examples if not eg.skip]
        for idx, (start, end) in enumerate(zip(pred_start, pred_end)):
            squad_eg = eval_examples_no_skip[idx]
            offsets = squad_eg.context_token_to_char
            # Most probable start / end token positions.
            start = np.argmax(start)
            end = np.argmax(end)
            # A start position beyond the context tokens cannot be mapped back
            # to context characters; count it as a miss.
            if start >= len(offsets):
                continue
            pred_char_start = offsets[start][0]
            if end < len(offsets):
                pred_char_end = offsets[end][1]
                pred_ans = squad_eg.context[pred_char_start:pred_char_end]
            else:
                # End falls outside the context tokens: take the rest of the context.
                pred_ans = squad_eg.context[pred_char_start:]

            normalized_pred_ans = normalize_text(pred_ans)
            normalized_true_ans = [normalize_text(ans) for ans in squad_eg.all_answers]
            if normalized_pred_ans in normalized_true_ans:
                count += 1
        acc = count / len(self.y_eval[0])
        print(f'\nepoch={epoch+1}, exact match score={acc:.2f}')

## Train and evaluate

In [12]:
# Train on only as many examples as there are usable evaluation examples.
train_size = x_eval[0].shape[0]
X = [features[:train_size] for features in x_train]
Y = [targets[:train_size] for targets in y_train]

In [13]:
%%time
exact_match_callback = ExactMatch(x_eval, y_eval)
model.fit(
    X,
    Y,
    epochs=1,
    batch_size=8,
    callbacks=[exact_match_callback]
)

Train on 9316 samples
epoch=1, exact match score=0.63
CPU times: user 5min 10s, sys: 58 s, total: 6min 8s
Wall time: 9min 8s


<tensorflow.python.keras.callbacks.History at 0x7f820975c150>