In [22]:
import json
import numpy as np
import matplotlib.pyplot as plt
import sys
import os
import tensorflow as tf
import random
from tensorflow.python.keras.utils.data_utils import Sequence
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import backend as K
from nltk.tokenize import sent_tokenize
import tensorflow_hub as hub
from transformers import BertTokenizer, TFBertModel
from tensorflow.keras.optimizers import Adam

print(tf.__version__)
os.environ["TF_FORCE_GPU_ALLOW_GROWTH"]="true"

2.0.0


In [23]:
print(tf.test.is_gpu_available(cuda_only = False, min_cuda_compute_capability = None))

True


In [24]:
class BertLayer(tf.keras.layers.Layer):
    def __init__(self,
        n_fine_tune_layers=10,
        pooling="mean",
        bert_path="https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1",
        **kwargs,):
        self.n_fine_tune_layers = n_fine_tune_layers
        self.trainable = True
        self.output_size = 768
        self.pooling = pooling
        self.bert_path = bert_path
        super(BertLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        self.bert = hub.Module(
            self.bert_path,
            trainable=self.trainable,
            name="{}_module".format(self.name)
        )
        trainable_vars = self.bert.variables
        
        # Remove unused layers
        trainable_vars = self.bert.variables
        if self.pooling == "first":
            trainable_vars = [var for var in trainable_vars if not "/cls/" in var.name]
            trainable_layers = ["pooler/dense"]

        elif self.pooling == "mean":
            trainable_vars = [
                var
                for var in trainable_vars
                if not "/cls/" in var.name and not "/pooler/" in var.name
            ]
            trainable_layers = []
        else:
            raise NameError(
                f"Undefined pooling type (must be either first or mean, but is {self.pooling}"
            )

        # Select how many layers to fine tune
        for i in range(self.n_fine_tune_layers):
            trainable_layers.append(f"encoder/layer_{str(11 - i)}")

        # Update trainable vars to contain only the specified layers
        trainable_vars = [
            var
            for var in trainable_vars
            if any([l in var.name for l in trainable_layers])
        ]

        # Add to trainable weights
        for var in trainable_vars:
            self._trainable_weights.append(var)

        for var in self.bert.variables:
            if var not in self._trainable_weights:
                self._non_trainable_weights.append(var)

        super(BertLayer, self).build(input_shape)


    def call(self, inputs):
        inputs = [K.cast(x, dtype="int32") for x in inputs]
        input_ids, input_mask, segment_ids = inputs
        bert_inputs = dict(
            input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids
        )
        if self.pooling == "first":
            pooled = self.bert(inputs=bert_inputs, signature="tokens", as_dict=True)[
                "pooled_output"
            ]
        elif self.pooling == "mean":
            result = self.bert(inputs=bert_inputs, signature="tokens", as_dict=True)[
                "sequence_output"
            ]

            mul_mask = lambda x, m: x * tf.expand_dims(m, axis=-1)
            masked_reduce_mean = lambda x, m: tf.reduce_sum(mul_mask(x, m), axis=1) / (
                    tf.reduce_sum(m, axis=1, keepdims=True) + 1e-10)
            input_mask = tf.cast(input_mask, tf.float32)
            pooled = masked_reduce_mean(result, input_mask)
        else:
            raise NameError(f"Undefined pooling type (must be either first or mean, but is {self.pooling}")

        return pooled

    def compute_output_shape(self, input_shape):
        return (input_shape[0], self.output_size)


In [35]:
def createModel_YesNo(vocab_size, batchSize, maxlen):
    bLayer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2", trainable=True)
    
    #Document
    input_ids_Document = tf.keras.layers.Input(shape = (maxlen, ), dtype = tf.int32)
    token_type_ids_Document = tf.keras.layers.Input(shape=(maxlen,), dtype=tf.int32)
    attention_mask_Document = tf.keras.layers.Input(shape=(maxlen,), dtype=tf.int32)
    
    bertInputs_Document = [input_ids_Document, token_type_ids_Document, attention_mask_Document]
    
    #bertOutput_Document = bLayer(n_fine_tune_layers = 3)(bertInputs_Document)
    bertOutput_Document, _ = bLayer(bertInputs_Document)
    
    #Question
    input_ids_Question = tf.keras.layers.Input(shape = (maxlen, ), dtype = tf.int32)
    token_type_ids_Question = tf.keras.layers.Input(shape=(maxlen,), dtype=tf.int32)
    attention_mask_Question = tf.keras.layers.Input(shape=(maxlen,), dtype=tf.int32)
    
    bertInputs_Question = [input_ids_Question, token_type_ids_Question, attention_mask_Question]
    
    #bertInputs_Question = bLayer(n_fine_tune_layers = 3)(bertInputs_Question)
    bertInputs_Question, _ = bLayer(bertInputs_Question)
    
    #Concat Layer
    concat = tf.keras.layers.Concatenate()([bertOutput_Document, bertInputs_Question])
    
    denseLayer = tf.keras.layers.Dense(128, activation = 'relu')(concat)
    denseLayer = tf.keras.layers.Flatten()(denseLayer)
    denseLayer = tf.keras.layers.Dense(2)(denseLayer)
    
    model = tf.keras.Model(inputs = [input_ids_Document, token_type_ids_Document, attention_mask_Document, input_ids_Question, token_type_ids_Question, attention_mask_Question],
                           outputs = [denseLayer])
    return model
    

In [36]:
BatchSize = 16
SeqLength = 10000

In [37]:
class trainGenSeq_short_YesNo(tf.keras.utils.Sequence, ):
    def __init__(self, batchSize, sentenceLength):
        self.batchSize = batchSize
        self.trainFiles = os.listdir('D:/Python/Datasets/v1.0/train/')
        self.trainingSamples = 307372 * 2
        self.sentenceLength = sentenceLength
        
        #Load Vocab
        slow_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        save_path = "bert_base_uncased/"
        if not os.path.exists(save_path):
            os.makedirs(save_path)
        slow_tokenizer.save_pretrained(save_path)
        self.tokenizer = BertTokenizer('vocab.txt', lowercase = True)
        self.vocabSize = len(self.tokenizer.vocab)
        
    
    def __len__(self):
        return int(self.trainingSamples // self.batchSize)
    
    def getLen(self):
        return int(self.trainingSamples // self.batchSize)
    
    def attentionMasks(self,input_dims):
        return [int(id > 0) for id in input_dims]
        
    def inputDims(self, dims):
        return pad_sequences([dims], maxlen = self.sentenceLength, dtype="long", value=0, truncating="post", padding="post")[0]
    
    def encode_sentence(self, sentence):
        sentence = sent_tokenize(sentence)
        ans = []
        for i in range(len(sentence)):
            encode_sent = self.tokenizer.encode(sentence[i],add_special_tokens = True)
            ans += encode_sent

        ans = pad_sequences([ans], maxlen = self.sentenceLength, dtype = "long", value = 0, truncating = "post", padding = "post")
        return ans[0]
    
    def __getitem__(self, _):
        documentStack = np.array([])
        questionStack = np.array([])
        answerStack = np.array([])
        
        document_AttStack = np.array([])
        question_AttStack = np.array([])

        document_SegStack = np.array([])
        question_SegStack = np.array([])

        First = True
        
        for file in self.trainFiles:
            for line in open('D:/Python/Datasets/v1.0/train/' + file):
                file = json.loads(line)
                #annotations
                if file.get('annotations')[0].get('short_answers'):
                    s_Start = file.get('annotations')[0].get('short_answers')[0].get('start_token')
                    s_End = file.get('annotations')[0].get('short_answers')[0].get('end_token')
                    l_Start = file.get('annotations')[0].get('long_answer').get('start_token')
                    l_End = file.get('annotations')[0].get('long_answer').get('end_token')

                    #Question and Title
                    question = file.get('question_text')

                    #document
                    document = []
                    for indexs in file.get('document_tokens')[l_Start:l_End]:
                        if indexs.get('html_token') == False:
                            document.append(indexs.get('token'))
                    
                    #Fake Document OR No document
                    fake = []
                    randomNumber = random.randint(7500, 9000)
                    front = random.choice([True, False])
                    
                    if front:
                        try:
                            for indexs in range(max(0, l_Start - randomNumber), min(len(file.get('document_tokens')),l_End - randomNumber)):
                                if file.get('document_tokens')[indexs].get('html_token') == False:
                                    fake.append(file.get('document_tokens')[indexs].get('token'))
                                else:
                                    indexs -= 1
                        except:
                            for indexs in range(max(0, l_Start + randomNumber), min(len(file.get('document_tokens')),l_End + randomNumber)):
                                if file.get('document_tokens')[indexs].get('html_token') == False:
                                    fake.append(file.get('document_tokens')[indexs].get('token'))
                                else:
                                    indexs -= 1
                    else:
                        try:
                            for indexs in range(max(0, l_Start + randomNumber), min(len(file.get('document_tokens')),l_End + randomNumber)):
                                if file.get('document_tokens')[indexs].get('html_token') == False:
                                    fake.append(file.get('document_tokens')[indexs].get('token'))
                                else:
                                    indexs -= 1
                        except:
                            for indexs in range(max(0, l_Start - randomNumber), min(len(file.get('document_tokens')),l_End - randomNumber)):
                                if file.get('document_tokens')[indexs].get('html_token') == False:
                                    fake.append(file.get('document_tokens')[indexs].get('token'))
                                else:
                                    indexs -= 1
                    
                    document = ' '.join(document)
                    fake = ' '.join(document)

                    document = self.encode_sentence(document)
                    fake = self.encode_sentence(fake)
                    question = self.encode_sentence(question)
                    
                    fake_AttentionMask = self.attentionMasks(fake)
                    document_AttentionMask = self.attentionMasks(document)
                    question_AttentionMask = self.attentionMasks(question)
                    
                    fake_SegID = [0 for _ in range(len(fake))]
                    document_SegID = [0 for _ in range(len(document))]
                    question_SegID = [0 for _ in range(len(question))]

                    if First:
                        #Document
                        documentStack = np.array([document])
                        documentStack = np.append(documentStack, np.array([fake]), axis = 0)
                        document_AttStack = np.array([document_AttentionMask])
                        document_AttStack = np.append(document_AttStack, np.array([fake_AttentionMask]), axis = 0)
                        document_SegStack = np.array([document_SegID])
                        document_SegStack = np.append(document_SegStack, np.array([fake_AttentionMask]), axis = 0)
                        
                        #Add Question Again
                        questionStack = np.array([question])
                        questionStack = np.append(questionStack, np.array([question]), axis = 0)
                        
                        question_AttStack = np.array([question_AttentionMask])
                        question_AttStack = np.append(question_AttStack, np.array([question_AttentionMask]), axis = 0)
                        question_SegStack = np.array([question_SegID])
                        question_SegStack = np.append(question_SegStack, np.array([question_SegID]), axis = 0)
                        
                        #Add Answer
                        answerStack = np.array([np.array([1,0])])
                        answerStack = np.append(answerStack, np.array([np.array([0,1])]), axis = 0)
                        
                        First = False
                    else:
                        documentStack = np.append(documentStack, np.array([document]), axis = 0)
                        documentStack = np.append(documentStack, np.array([fake]), axis = 0)
                        questionStack = np.append(questionStack, np.array([question]), axis = 0)
                        questionStack = np.append(questionStack, np.array([question]), axis = 0)
                        answerStack = np.append(answerStack, np.array([np.array([1,0])]), axis = 0)
                        answerStack = np.append(answerStack, np.array([np.array([0,1])]), axis = 0)
                        
                        #Attention Mask
                        document_AttStack = np.append(document_AttStack, np.array([document_AttentionMask]), axis = 0)
                        document_AttStack = np.append(document_AttStack, np.array([fake_AttentionMask]), axis = 0)
                        question_AttStack = np.append(question_AttStack, np.array([question_AttentionMask]), axis = 0)
                        question_AttStack = np.append(question_AttStack, np.array([question_AttentionMask]), axis = 0)
                        
                        #SegmentIDs
                        document_SegStack = np.append(document_SegStack, np.array([document_AttentionMask]), axis = 0)
                        document_SegStack = np.append(document_SegStack, np.array([fake_AttentionMask]), axis = 0)
                        question_SegStack = np.append(question_SegStack, np.array([question_SegID]), axis = 0)
                        question_SegStack = np.append(question_SegStack, np.array([question_SegID]), axis = 0)
                
                if documentStack.shape[0] == self.batchSize:
                    documentStack = np.reshape(documentStack, (documentStack.shape[0], 1, documentStack.shape[1]))
                    questionStack = np.reshape(questionStack, (questionStack.shape[0], 1, questionStack.shape[1]))
                    answerStack = np.reshape(answerStack, (answerStack.shape[0], 1, answerStack.shape[1]))
                    First = True

                    #print(type(documentStack), type(questionStack), type(answerStack))
                    return [np.squeeze(documentStack), np.squeeze(document_AttStack), np.squeeze(document_SegStack), 
                            np.squeeze(questionStack), np.squeeze(question_AttStack), np.squeeze(question_SegStack)], np.squeeze(answerStack)
                    
                    documentStack = None
                    titleStack = None
                    questionStack = None
                    answerStack = None

trainGen = trainGenSeq_short_YesNo(BatchSize, SeqLength)

In [38]:
model = createModel_YesNo(trainGen.vocabSize, BatchSize, SeqLength)
model.summary()
model.compile(optimizer = Adam(), loss = 'categorical_crossentropy')

Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_25 (InputLayer)           [(None, 10000)]      0                                            
__________________________________________________________________________________________________
input_26 (InputLayer)           [(None, 10000)]      0                                            
__________________________________________________________________________________________________
input_27 (InputLayer)           [(None, 10000)]      0                                            
__________________________________________________________________________________________________
input_28 (InputLayer)           [(None, 10000)]      0                                            
____________________________________________________________________________________________

In [39]:
model.fit(trainGen, epochs = 10, steps_per_epoch = trainGen.getLen(),verbose = 1)

Epoch 1/10


CancelledError: [_Derived_]RecvAsync is cancelled.
	 [[{{node loss_3/mul}}]]