In [28]:
from pathlib import Path
import json
import os
import random
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tokenizers import BertWordPieceTokenizer
from transformers import BertTokenizer, TFBertModel


In [159]:

def read_squad(path):
    """Flatten a SQuAD-format JSON file into three parallel lists.

    One row is produced per answer: a question with several answers yields
    several rows (each repeating the context and question), and a question
    with an empty ``answers`` list yields none.

    Args:
        path: Location of the JSON file (str or path-like).

    Returns:
        Tuple ``(contexts, questions, answers)`` of equal-length lists, where
        each element of ``answers`` is the raw answer object from the file.
    """
    with Path(path).open('rb') as handle:
        squad = json.load(handle)

    contexts, questions, answers = [], [], []
    for article in squad['data']:
        for paragraph in article['paragraphs']:
            passage_text = paragraph['context']
            for qa_entry in paragraph['qas']:
                question_text = qa_entry['question']
                gold_answers = qa_entry['answers']
                # Repeat context/question once per answer to keep the lists aligned.
                contexts.extend([passage_text] * len(gold_answers))
                questions.extend([question_text] * len(gold_answers))
                answers.extend(gold_answers)

    return contexts, questions, answers

# Load the flattened (context, question, answer) rows and put them in a frame
# so they can be split together.
contexts, questions, answers = read_squad('train-v2.0.json')
data={"contexts":contexts,"ques":questions,"ans":answers}
df = pd.DataFrame.from_dict(data)

# Fixed seed so the 80/20 split is identical on every Restart & Run All.
SPLIT_SEED = 42
# NOTE(review): rows are one-per-answer, so this row-wise split can place the
# same passage (or question) in both train and test — confirm that is acceptable.
train,test= train_test_split(df, test_size=0.2, random_state=SPLIT_SEED)

# Plain Python lists for the training split.
train_contexts=train["contexts"].tolist()
train_questions=train["ques"].tolist()
train_answers=train["ans"].tolist()

train_df={'train_contexts':train_contexts, 'train_questions':train_questions, 'train_answers':train_answers}

# Plain Python lists for the held-out split.
test_contexts=test["contexts"].tolist()
test_questions=test["ques"].tolist()
test_answers=test["ans"].tolist()

test_df={'test_contexts':test_contexts ,'test_questions':test_questions, 'test_answers':test_answers}

In [9]:
def clean(df):
    """Collapse whitespace in every value of every column, in place.

    Each item is converted with ``str()``, split on any whitespace and
    re-joined with single spaces, so leading/trailing whitespace is removed
    and non-string items become their string form.

    Args:
        df: Mapping-like object exposing ``keys()`` whose values are iterables;
            each value is replaced by a new list of cleaned strings.

    Returns:
        None. ``df`` is modified in place.
    """
    for column in df.keys():
        df[column] = [" ".join(str(item).split()) for item in df[column]]

In [156]:
# Inspect the training answers; each entry is a dict with 'text' and 'answer_start'.
train_answers

[{'text': 'John F. Kennedy, James Monroe, Taft, Theodore Roosevelt, Adlai Stevenson, Evander Childs, Christopher Columbus, Morris, Walton, and South Bronx High Schools',
  'answer_start': 281},
 {'text': 'Chinese character dictionaries', 'answer_start': 0},
 {'text': '2007', 'answer_start': 80},
 {'text': "People's Daily", 'answer_start': 17},
 {'text': 'MiMo', 'answer_start': 512},
 {'text': 'Diego Maradona', 'answer_start': 702},
 {'text': 'both single leg and double leg work', 'answer_start': 41},
 {'text': 'a few days', 'answer_start': 178},
 {'text': 'administrator von Kállay', 'answer_start': 1287},
 {'text': 'Frederick the Great,', 'answer_start': 0},
 {'text': '"going negative" carries risks', 'answer_start': 800},
 {'text': 'a formal clergy-laity division', 'answer_start': 695},
 {'text': 'decisive experiments', 'answer_start': 263},
 {'text': 'Chittenden Avenue', 'answer_start': 92},
 {'text': 'extreme programming and the agile software development',
  'answer_start': 56},
 {

In [42]:
# Save the slow pretrained tokenizer so the vocab.txt read below is on disk.
slow_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
save_path = "bert_base_uncased/"
# exist_ok=True replaces the check-then-create pattern: re-running the cell is
# safe and there is no window between the existence check and the creation.
os.makedirs(save_path, exist_ok=True)
slow_tokenizer.save_pretrained(save_path)

# Build the WordPiece tokenizer from the saved vocabulary; the path is derived
# from save_path so the two cannot drift apart.
tokenizer = BertWordPieceTokenizer(os.path.join(save_path, "vocab.txt"), lowercase=True)

In [123]:
def ds():
    """Return the sum of 1 and 2 (scratch helper; takes no input)."""
    return 1 + 2



3

In [177]:
def _answer_token_span(offsets, start_char_idx, end_char_idx):
    """Return (first, last) indices of tokens overlapping [start_char_idx, end_char_idx), or None."""
    hits = [
        idx
        for idx, (tok_start, tok_end) in enumerate(offsets)
        # Non-empty token whose character range intersects the answer range.
        # Tokens with an empty range (e.g. offsets of (0, 0)) never match.
        if tok_end > tok_start and tok_start < end_char_idx and tok_end > start_char_idx
    ]
    if not hits:
        return None
    return hits[0], hits[-1]


def get_model_inputs(context,answer,questions):
    """Build BERT question-answering inputs and answer-span labels.

    For each (context, answer, question) triple the answer's character span is
    mapped to token indices using the offsets from the module-level
    ``tokenizer``, and the (context, question) pair is encoded with the
    module-level ``slow_tokenizer``. Examples whose answer span is empty, lies
    outside the context, matches no token, or ends past the encoded input are
    dropped.

    Args:
        context: Iterable of context passages (str).
        answer: Iterable of dicts with keys ``'text'`` and ``'answer_start'``.
        questions: Iterable of question strings.

    Returns:
        Dict with keys ``input_ids``, ``token_type_ids``, ``attention_mask``,
        ``start_token_idx`` and ``end_token_idx``; each maps to a list with one
        entry per kept example.
    """
    dataset_dict = {
        "input_ids": [],
        "token_type_ids": [],
        "attention_mask": [],
        "start_token_idx": [],
        "end_token_idx": [],
    }
    for con, ans, ques in zip(context, answer, questions):
        start_char_idx = ans['answer_start']
        end_char_idx = start_char_idx + len(ans['text'])  # exclusive end
        # Validate against THIS passage's length (not the number of passages).
        # An answer that ends exactly at the end of the passage is still valid.
        if start_char_idx < 0 or end_char_idx <= start_char_idx or end_char_idx > len(con):
            continue

        # Map the character span to token indices via the context-only encoding.
        tokenized_context = tokenizer.encode(con)
        span = _answer_token_span(tokenized_context.offsets, start_char_idx, end_char_idx)
        if span is None:  # no token overlaps the answer
            continue
        start_token_idx, end_token_idx = span

        # The context is passed first, so its tokens come first in the pair.
        # NOTE(review): assumes both tokenizers split the context identically
        # and add the same leading special token — confirm.
        tokenized_question_context = slow_tokenizer(con, ques, padding=True, truncation=True)
        input_ids = tokenized_question_context['input_ids']
        # A label beyond the encoded input cannot be valid.
        # NOTE(review): if truncation shortens the context, an in-range index
        # could still point outside the context tokens — confirm max length handling.
        if end_token_idx >= len(input_ids):
            continue

        dataset_dict['input_ids'].append(input_ids)
        dataset_dict['token_type_ids'].append(tokenized_question_context['token_type_ids'])
        dataset_dict['attention_mask'].append(tokenized_question_context['attention_mask'])
        dataset_dict['start_token_idx'].append(start_token_idx)
        dataset_dict['end_token_idx'].append(end_token_idx)
    return dataset_dict
    
                
            
        
        
        
        
        
        
        
        
        
        
        
                
                
                
        
        

In [178]:
# Build the model-input dict for the training split.
train_input=get_model_inputs(train_contexts,train_answers,train_questions)
# NOTE: get_model_inputs returns a single dict, so the two-name unpacking below
# would fail if re-enabled.
#test_x,test_y=get_model_inputs(test_contexts,test_answers,test_questions)

In [179]:
# Display the dict returned by get_model_inputs (the full dump is very long).
train_input

{'input_ids': [[101,
   2198,
   22038,
   28954,
   2351,
   1997,
   2566,
   9956,
   3490,
   7315,
   3303,
   2011,
   1037,
   2566,
   29278,
   4383,
   4308,
   2012,
   2539,
   1024,
   4749,
   2334,
   2051,
   2006,
   1017,
   2238,
   3699,
   2012,
   1996,
   2287,
   1997,
   6282,
   1010,
   4566,
   1037,
   3181,
   21179,
   18513,
   3686,
   1997,
   2176,
   2086,
   1998,
   2698,
   2706,
   1012,
   2002,
   2351,
   2074,
   2004,
   1037,
   3742,
   2005,
   2032,
   2736,
   1999,
   3002,
   2848,
   1005,
   1055,
   2675,
   2917,
   1010,
   6334,
   2011,
   15153,
   19817,
   8490,
   6632,
   1012,
   2044,
   2002,
   2351,
   1010,
   2010,
   8306,
   2001,
   8887,
   2135,
   10410,
   2000,
   2156,
   2065,
   2002,
   2001,
   2757,
   1010,
   1998,
   2216,
   2007,
   2032,
   1999,
   1996,
   2282,
   2056,
   12583,
   1012,
   2059,
   1996,
   2282,
   2001,
   14640,
   1010,
   2947,
   21672,
   1996,
   2111,
   1997,
   20