In [28]:
import json
import os
import sys
import tensorflow as tf
import numpy as np
from tensorflow.python.keras.utils.data_utils import Sequence

In [29]:
import warnings
warnings.filterwarnings('ignore')

In [30]:
def createModel(vocab_size, batchSize):
    #Document
    document_input = tf.keras.layers.Input(batch_shape = [batchSize, None])
    document = tf.keras.layers.Embedding(vocab_size, 256, batch_input_shape = [batchSize, None])(document_input)
    document = tf.keras.layers.GRU(1024, return_sequences = True, stateful = True, recurrent_initializer='glorot_uniform')(document)
    document = tf.keras.layers.Dense(256, activation = 'relu')(document)
    document_model = tf.keras.models.Model(inputs = document_input, outputs = document)
    
    #Question
    question_input = tf.keras.layers.Input(batch_shape = [batchSize, None])
    question = tf.keras.layers.Embedding(vocab_size, 256)(question_input)
    question = tf.keras.layers.GRU(1024, return_sequences = True, stateful = True, recurrent_initializer='glorot_uniform')(question)
    question = tf.keras.layers.Dense(126, activation = 'relu')(question)
    question_model = tf.keras.models.Model(inputs = question_input, outputs = question)
    
    #concat
    model = tf.keras.layers.concatenate([document_model.output, question_model.output])
    model = tf.keras.layers.Dense(vocab_size)(model)
    
    finalModel = tf.keras.models.Model(inputs = [document_input, question_input], outputs = model)
    
    return finalModel

In [34]:
class trainGenSeq_short(tf.keras.utils.Sequence, ):
    def __init__(self, batchSize):
        self.batchSize = batchSize
        self.trainFiles = os.listdir('D:/Python/Datasets/v1.0/train/')
        self.tokenizer = self.loadTokenizer()
        self.trainingSamples = 307372
    
    def __len__(self):
        return int(self.trainingSamples // self.batchSize)
    
    def getLen(self):
        return int(self.trainingSamples // self.batchSize)
    
    def loadTokenizer(self):
        myList = []
        for line in open('vocab_word.txt'):
            myList.append(line[:len(line)-1])
        tk = tf.keras.preprocessing.text.Tokenizer(num_words=len(myList))
        tk.fit_on_texts(myList)
        return tk
    
    def __getitem__(self, _):
        documentStack = np.array([])
        titleStack = np.array([])
        questionStack = np.array([])
        answerStack = np.array([])
        First = True
        
        for file in self.trainFiles:
            for line in open('D:/Python/Datasets/v1.0/train/' + file):
                file = json.loads(line)
                
                #annotations
                if file.get('annotations')[0].get('short_answers'):
                    s_Start = file.get('annotations')[0].get('short_answers')[0].get('start_token')
                    s_End = file.get('annotations')[0].get('short_answers')[0].get('end_token')

                    #Question and Title
                    question = file.get('question_text')
                    title = file.get('document_title')

                    #document
                    document = []
                    for indexs in file.get('document_tokens'):
                        if indexs.get('html_token') == False:
                            document.append(indexs.get('token'))

                    #answer
                    answer = []
                    for index in range(s_Start,s_End):
                        if file.get('document_tokens')[index].get('html') == False:
                            answer.append(file.get('document_tokens')[index].get('token'))

                    #Convert to Array
                    document = tf.keras.preprocessing.sequence.pad_sequences(self.tokenizer.texts_to_sequences(document), maxlen = 1000000)
                    answer = tf.keras.preprocessing.sequence.pad_sequences(self.tokenizer.texts_to_sequences(' '.join(answer)), maxlen = 1000000)
                    title = tf.keras.preprocessing.sequence.pad_sequences(self.tokenizer.texts_to_sequences(title), maxlen = 1000000)
                    answer = tf.keras.preprocessing.sequence.pad_sequences(self.tokenizer.texts_to_sequences(' '.join(answer)), maxlen = 1000000)
                    
                    

                    if First:
                        documentStack = document
                        titleStack = title
                        questionStack = question
                        answerStack = answer
                        First = False
                    else:
                        print(documentStack.shape, document.shape)
                        documentStack = np.stack((documentStack, document))
                        titleStack = np.concatenate((titleStack, title))
                        questionStack = np.concatenate((questionStack, question))
                        answerStack = np.concatenate((answerStack, answer))
                
                if documentStack.shape[0] == self.batchSize:
                    First = True
                    #documentStack = np.array(documentStack)
                    #questionStack = np.array(questionStack)
                    #answerStack = np.array(answerStack)
                    
                    #print(type(documentStack), type(questionStack), type(answerStack))
                    return [documentStack, questionStack], answerStack
                    #return documentStack, answerStack
                    
                    documentStack = None
                    titleStack = None
                    questionStack = None
                    answerStack = None

In [35]:
myList = []
for line in open('vocab_word.txt'):
    myList.append(line[:len(line)-1])

batchSize = 64
vocabSize = len(myList)

model = createModel(vocabSize, batchSize)
model.summary()

model.compile(optimizer = 'adam', loss = 'sparse_categorical_crossentropy')

Model: "model_41"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_27 (InputLayer)           [(64, None)]         0                                            
__________________________________________________________________________________________________
input_28 (InputLayer)           [(64, None)]         0                                            
__________________________________________________________________________________________________
embedding_26 (Embedding)        (64, None, 256)      7813632     input_27[0][0]                   
__________________________________________________________________________________________________
embedding_27 (Embedding)        (64, None, 256)      7813632     input_28[0][0]                   
___________________________________________________________________________________________

In [36]:
trainSeq = trainGenSeq_short(batchSize)

history = model.fit(         trainSeq,
                             epochs = 2,
                             steps_per_epoch = trainSeq.getLen(),
                             verbose = 1)

MemoryError: 