In [86]:
import json
import numpy as np
import matplotlib.pyplot as plt
import sys
import os
import tensorflow as tf
import random
from tensorflow.python.keras.utils.data_utils import Sequence

import bert
from bert import tokenization
from bert import bert_tokenization
import tensorflow_hub as hub

In [87]:
def createModel_YesNo(vocab_size, batchSize):
    #Document
    document_input = tf.keras.layers.Input(batch_shape = [batchSize, None])
    document = tf.keras.layers.Embedding(vocab_size, 256, batch_input_shape = [batchSize, None])(document_input)
    document = tf.keras.layers.GRU(1024, return_sequences = True, stateful = True, recurrent_initializer='glorot_uniform')(document)
    document = tf.keras.layers.Dense(256, activation = 'relu')(document)
    document_model = tf.keras.models.Model(inputs = document_input, outputs = document)
    
    #Question
    question_input = tf.keras.layers.Input(batch_shape = [batchSize, None])
    question = tf.keras.layers.Embedding(vocab_size, 256)(question_input)
    question = tf.keras.layers.GRU(1024, return_sequences = True, stateful = True, recurrent_initializer='glorot_uniform')(question)
    question = tf.keras.layers.Dense(126, activation = 'relu')(question)
    question_model = tf.keras.models.Model(inputs = question_input, outputs = question)
    
    #concat
    model = tf.keras.layers.concatenate([document_model.output, question_model.output])
    model = tf.keras.layers.Dense(2, activation = 'softmax')(model)
    
    finalModel = tf.keras.models.Model(inputs = [document_input, question_input], outputs = model)
    
    return finalModel

In [92]:
class trainGenSeq_short_YesNo(tf.keras.utils.Sequence, ):
    def __init__(self, batchSize, sentenceLength):
        self.batchSize = batchSize
        self.trainFiles = os.listdir('D:/Python/Datasets/v1.0/train/')
        self.tokenizer = self.loadTokenizer()
        self.trainingSamples = 307372
        self.sentenceLength = sentenceLength
        
        #Load Vocab
        self.tokenizer = bert_tokenization.FullTokenizer(vocab_file='D:/Python/Q_A/uncased_L-24_H-1024_A-16/vocab.txt', do_lower_case=True)
        self.vocabSize = len(self.tokenizer.vocab)
        
    
    def __len__(self):
        return int(self.trainingSamples // self.batchSize)
    
    def getLen(self):
        return int(self.trainingSamples // self.batchSize)
    
    def loadTokenizer(self):
        myList = []
        for line in open('vocab_word.txt'):
            myList.append(line[:len(line)-1])
        tk = tf.keras.preprocessing.text.Tokenizer(num_words=len(myList))
        tk.fit_on_texts(myList)
        return tk
    
    def encode_sentence(self, sentence):
        ans = list(self.tokenizer.tokenize(sentence))
        ans.append('[SEP]')
        ans = self.tokenizer.convert_tokens_to_ids(ans)
        ans = ans + ([0] * (self.sentenceLength - len(ans))) 
        return ans
    
    def __getitem__(self, _):
        documentStack = np.array([])
        questionStack = np.array([])
        answerStack = np.array([])
        First = True
        
        for file in self.trainFiles:
            for line in open('D:/Python/Datasets/v1.0/train/' + file):
                file = json.loads(line)
                
                #annotations
                if file.get('annotations')[0].get('short_answers'):
                    s_Start = file.get('annotations')[0].get('short_answers')[0].get('start_token')
                    s_End = file.get('annotations')[0].get('short_answers')[0].get('end_token')
                    l_Start = file.get('annotations')[0].get('long_answer').get('start_token')
                    l_End = file.get('annotations')[0].get('long_answer').get('end_token')

                    #Question and Title
                    question = file.get('question_text')

                    #document
                    document = []
                    for indexs in file.get('document_tokens')[l_Start:l_End]:
                        if indexs.get('html_token') == False:
                            document.append(indexs.get('token'))
                    
                    #Fake Document OR No document
                    fake = []
                    randomNumber = random.randint(int(0.75 * self.sentenceLength), self.sentenceLength)
                    front = random.choice([True, False])
                    
                    if front:
                        try:
                            for indexs in range(max(0, l_Start - randomNumber), min(len(file.get('document_tokens')),l_End - randomNumber)):
                                if file.get('document_tokens')[indexs].get('html_token') == False:
                                    fake.append(file.get('document_tokens')[indexs].get('token'))
                                else:
                                    indexs -= 1
                        except:
                            for indexs in range(max(0, l_Start + randomNumber), min(len(file.get('document_tokens')),l_End + randomNumber)):
                                if file.get('document_tokens')[indexs].get('html_token') == False:
                                    fake.append(file.get('document_tokens')[indexs].get('token'))
                                else:
                                    indexs -= 1
                    else:
                        try:
                            for indexs in range(max(0, l_Start + randomNumber), min(len(file.get('document_tokens')),l_End + randomNumber)):
                                if file.get('document_tokens')[indexs].get('html_token') == False:
                                    fake.append(file.get('document_tokens')[indexs].get('token'))
                                else:
                                    indexs -= 1
                        except:
                            for indexs in range(max(0, l_Start - randomNumber), min(len(file.get('document_tokens')),l_End - randomNumber)):
                                if file.get('document_tokens')[indexs].get('html_token') == False:
                                    fake.append(file.get('document_tokens')[indexs].get('token'))
                                else:
                                    indexs -= 1
                    
                    document = ' '.join(document)
                    fake = ' '.join(document)

                    document = self.encode_sentence(document)
                    fake = self.encode_sentence(fake)
                    question = self.encode_sentence(question)

                    if First:
                        documentStack = np.array([document])
                        documentStack = np.append(documentStack, np.array([fake]), axis = 0)
                        #Add Question Again
                        questionStack = np.array([question])
                        questionStack = np.append(questionStack, np.array([question]), axis = 0)
                        #Add Answer
                        answerStack = np.array([[1,0]])
                        answerStack = np.append(answerStack, np.array([[0,1]]), axis = 0)
                        First = False
                    else:
                        documentStack = np.append(documentStack, np.array([document]), axis = 0)
                        documentStack = np.append(documentStack, np.array([fake]), axis = 0)
                        questionStack = np.append(questionStack, np.array([question]), axis = 0)
                        questionStack = np.append(questionStack, np.array([question]), axis = 0)
                        answerStack = np.append(answerStack, np.array([[1,0]]), axis = 0)
                        answerStack = np.append(answerStack, np.array([[0,1]]), axis = 0)
                
                if documentStack.shape[0] == self.batchSize:
                    print(documentStack.shape, questionStack.shape, answerStack.shape)
                    First = True

                    #print(type(documentStack), type(questionStack), type(answerStack))
                    yield [documentStack, questionStack], answerStack
                    #return documentStack, answerStack
                    
                    documentStack = None
                    titleStack = None
                    questionStack = None
                    #answerStack = None

In [93]:
trainGen = trainGenSeq_short_YesNo(64, 5000)

model = createModel_YesNo(trainGen.vocabSize, 64)

In [94]:
model.compile(optimizer = 'adam', loss = 'sparse_categorical_crossentropy')

In [95]:
model.fit(trainGen, epochs = 10, steps_per_epoch = trainGen.getLen(),verbose = 1)

(64, 5000) (64, 5000) (64, 2)


TypeError: int() argument must be a string, a bytes-like object or a number, not 'tuple'