In [2]:
from pathlib import Path
import json
import os
import random
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tokenizers import BertWordPieceTokenizer
from transformers import BertTokenizer, TFBertModel


In [3]:

def read_squad(path):
    """Flatten a SQuAD-format JSON file into three parallel lists.

    The file is expected to hold a top-level ``'data'`` list whose entries
    contain ``'paragraphs'``; each paragraph has a ``'context'`` and a list
    of ``'qas'``; each qa has a ``'question'`` and a list of ``'answers'``.

    Args:
        path: location of the JSON file (string or path-like).

    Returns:
        A ``(contexts, questions, answers)`` tuple of equally long lists with
        one entry per answer. Questions whose ``'answers'`` list is empty
        therefore contribute nothing to the output.
    """
    with Path(path).open('rb') as handle:
        payload = json.load(handle)

    contexts, questions, answers = [], [], []
    for article in payload['data']:
        for paragraph in article['paragraphs']:
            passage_text = paragraph['context']
            for qa in paragraph['qas']:
                prompt = qa['question']
                gold_answers = list(qa['answers'])
                # Repeat the context/question once per answer so that the
                # three lists stay index-aligned.
                contexts.extend([passage_text] * len(gold_answers))
                questions.extend([prompt] * len(gold_answers))
                answers.extend(gold_answers)

    return contexts, questions, answers

# Load every (context, question, answer) triple from the SQuAD v2.0 training file.
contexts, questions, answers = read_squad('train-v2.0.json')
data = {"contexts": contexts, "ques": questions, "ans": answers}
df = pd.DataFrame.from_dict(data)

# Hold out 20% of the rows for testing. The fixed random_state makes the
# split identical after every kernel restart, so results are reproducible.
train, test = train_test_split(df, test_size=0.2, random_state=42)

train_contexts = train["contexts"].tolist()
train_questions = train["ques"].tolist()
train_answers = train["ans"].tolist()

# NOTE: despite the name this is a plain dict of lists, not a DataFrame.
train_df = {
    'train_contexts': train_contexts,
    'train_questions': train_questions,
    'train_answers': train_answers,
}

test_contexts = test["contexts"].tolist()
test_questions = test["ques"].tolist()
test_answers = test["ans"].tolist()

# NOTE: despite the name this is a plain dict of lists, not a DataFrame.
test_df = {
    'test_contexts': test_contexts,
    'test_questions': test_questions,
    'test_answers': test_answers,
}

In [None]:
def clean(df):
    """Collapse whitespace in every value of a mapping of sequences, in place.

    For each key, the stored sequence is replaced by a new list in which each
    item has been converted with ``str()``, split on whitespace and re-joined
    with single spaces (which also strips leading/trailing whitespace).

    NOTE: because every item goes through ``str()``, non-string items (for
    example dicts) are replaced by their normalized string representation.

    Args:
        df: mapping (anything exposing ``keys()`` and item assignment) from a
            key to an iterable of items.

    Returns:
        None; ``df`` is modified in place.
    """
    for column in df.keys():
        df[column] = [" ".join(str(item).split()) for item in df[column]]

In [4]:
# Save the slow pretrained tokenizer's files locally; the vocabulary file
# written here is what the WordPiece tokenizer below is built from.
slow_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
save_path = "bert_base_uncased/"
# exist_ok=True makes the cell safe to re-run without a separate exists check.
os.makedirs(save_path, exist_ok=True)
slow_tokenizer.save_pretrained(save_path)

# Derive the vocab location from save_path so the directory name is defined
# in exactly one place.
tokenizer = BertWordPieceTokenizer(os.path.join(save_path, "vocab.txt"), lowercase=True)

In [19]:
def get_model_inputs(context, answer, questions, max_seq_length=512, encode=None):
    """Turn SQuAD-style examples into padded model inputs and answer-span labels.

    For every example the context is tokenized, the answer's character span
    is mapped onto token indices through the tokenizer offsets, and the
    question ids (minus their first entry, presumably the leading [CLS]
    token; confirm against the tokenizer) are appended after the context ids.

    An example is dropped when
      * the answer's character span does not lie inside its context,
      * no context token overlaps the answer span, or
      * the combined sequence is longer than ``max_seq_length``.

    Args:
        context: sequence of context strings.
        answer: sequence of dicts with ``'text'`` and ``'answer_start'`` keys,
            aligned with ``context``.
        questions: sequence of question strings, aligned with ``context``.
        max_seq_length: length every kept sequence is zero-padded to.
        encode: callable mapping a string to an object exposing ``ids`` (list
            of ints) and ``offsets`` (list of ``(start, end)`` character
            pairs). Defaults to the ``encode`` method of the module-level
            ``tokenizer``.

    Returns:
        ``(x, y)`` where ``x`` is ``[input_ids, token_type_ids,
        attention_mask]`` and ``y`` is ``[start_token_idx, end_token_idx]``.
        When at least one example is kept, the ``x`` arrays have shape
        ``(n_kept, max_seq_length)`` and the ``y`` arrays ``(n_kept,)``;
        when nothing is kept every array is empty with shape ``(0,)``.
    """
    if encode is None:
        encode = tokenizer.encode

    feature_keys = ("input_ids", "token_type_ids", "attention_mask",
                    "start_token_idx", "end_token_idx")
    dataset_dict = {key: [] for key in feature_keys}

    for con, ans, ques in zip(context, answer, questions):
        # Character span of the answer, end exclusive.
        start_char_idx = ans['answer_start']
        end_char_idx = start_char_idx + len(ans['text'])
        # The span must fit inside *this* context (not the list of contexts);
        # an answer ending exactly at the end of the context is still valid.
        if start_char_idx < 0 or end_char_idx > len(con):
            continue

        tokenized_context = encode(con)
        # A token belongs to the answer when its character range shares at
        # least one character with the answer span. Zero-width offsets (as
        # used for special tokens) can never overlap.
        ans_token_idx = [
            j
            for j, (start, end) in enumerate(tokenized_context.offsets)
            if max(start, start_char_idx) < min(end, end_char_idx)
        ]
        if not ans_token_idx:
            # No token overlaps the answer: nothing to learn from this example.
            continue
        start_token_idx = ans_token_idx[0]
        end_token_idx = ans_token_idx[-1]

        tokenized_question = encode(ques)
        question_ids = tokenized_question.ids[1:]
        input_ids = tokenized_context.ids + question_ids
        token_type_ids = [0] * len(tokenized_context.ids) + [1] * len(question_ids)
        attention_mask = [1] * len(input_ids)

        padding_length = max_seq_length - len(input_ids)
        if padding_length < 0:
            # Too long for the model; dropped rather than truncated.
            continue
        input_ids = input_ids + [0] * padding_length
        attention_mask = attention_mask + [0] * padding_length
        token_type_ids = token_type_ids + [0] * padding_length

        dataset_dict["input_ids"].append(input_ids)
        dataset_dict["token_type_ids"].append(token_type_ids)
        dataset_dict["attention_mask"].append(attention_mask)
        dataset_dict["start_token_idx"].append(start_token_idx)
        dataset_dict["end_token_idx"].append(end_token_idx)

    # The model requires arrays as input.
    arrays = {key: np.asarray(values) for key, values in dataset_dict.items()}
    x = [arrays["input_ids"], arrays["token_type_ids"], arrays["attention_mask"]]
    y = [arrays["start_token_idx"], arrays["end_token_idx"]]
    return x, y
    
                
            
        
        
        
        
        
        
        
        
        
        
        
                
                
                
        
        

In [20]:
# Build the model inputs (train_x) and answer-span labels (train_y) from the
# training split.
train_x,train_y=get_model_inputs(train_contexts,train_answers,train_questions)

# The equivalent call for the held-out split is currently disabled.
#test_x,test_y=get_model_inputs(test_contexts,test_answers,test_questions)

# Not sure how to resolve the input type error. The data has already been converted to np.array, but Keras still does not accept it.

In [None]:
## BERT encoder
encoder = TFBertModel.from_pretrained("bert-base-uncased")
# Must equal the length the input sequences were padded to.
max_len = 512

## QA Model
# Three integer inputs of length max_len each.
input_ids = layers.Input(shape=(max_len,), dtype=tf.int32)
token_type_ids = layers.Input(shape=(max_len,), dtype=tf.int32)
attention_mask = layers.Input(shape=(max_len,), dtype=tf.int32)
embedding = encoder(
    input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask
)[0]  # get sequence output

# One bias-free Dense unit scores each position as the answer start ...
start_logits = layers.Dense(1, name="start_logit", use_bias=False)(embedding)
start_logits = layers.Flatten()(start_logits)

# ... and a second one scores each position as the answer end.
end_logits = layers.Dense(1, name="end_logit", use_bias=False)(embedding)
end_logits = layers.Flatten()(end_logits)

# Softmax turns the flattened scores into probabilities.
start_probs = layers.Activation(keras.activations.softmax)(start_logits)
end_probs = layers.Activation(keras.activations.softmax)(end_logits)

model = keras.Model(
    inputs=[input_ids, token_type_ids, attention_mask],
    outputs=[start_probs, end_probs],
)
# The outputs are already probabilities, hence from_logits=False; the labels
# are integer token indices, hence the sparse variant of the loss.
loss = keras.losses.SparseCategoricalCrossentropy(from_logits=False)
# `learning_rate` is the supported keyword; the old `lr` alias is deprecated
# and rejected by newer Keras releases.
optimizer = keras.optimizers.Adam(learning_rate=5e-5)
model.compile(optimizer=optimizer, loss=[loss, loss])
model.fit(train_x, train_y, epochs=2, verbose=2, batch_size=16)
model.save_weights("./weights.h5")
model.summary()

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Epoch 1/2


# Tried to convert the data to tensors, but that does not work here for me

In [40]:
#train_x=tf.data.Dataset.from_tensor_slices(train_x)
#train_x=tf.data.Dataset.from_tensor_slices(train_x)

#test_x=tf.data.Dataset.from_tensor_slices(test_x)
#test_x=tf.data.Dataset.from_tensor_slices(test_x)

ValueError: Can't convert non-rectangular Python sequence to Tensor.