In [None]:
import tensorflow as tf

import os
import random
import json

# load BERT modules
from official import nlp
import official.nlp.bert as bert
import official.nlp.bert.tokenization as tokenization
import official.nlp.bert.configs as configs
import official.nlp.bert.bert_models as bert_models
import official.nlp.optimization

# show which TensorFlow version this kernel is running
print(f'Tensorflow version {tf.__version__}')

# disable warning messages: the TensorFlow logger only reports ERROR and above
tf.get_logger().setLevel('ERROR')

In [None]:
# Path components of the directory that holds the input data; they are joined
# with os.path.join when the data file is loaded further down.
# The value is machine-specific: adjust it to your environment
# (an alternative location is kept commented out below).
# data_path = ['media', 'data']
data_path = ['D:\\', 'data']

In [None]:
def json_load(name):
    """Read ``<name>.json`` (UTF-8) and return the parsed JSON content.

    `name` is the file path without the ``.json`` extension.
    """
    path = f'{name}.json'
    with open(path, mode='r', encoding='utf-8') as handle:
        content = json.load(handle)
    return content
    
def json_save(name, item):
    """Write `item` as indented JSON to ``<name>.json`` (UTF-8).

    `name` is the file path without the ``.json`` extension. Non-ASCII
    characters are written as-is rather than as ``\\uXXXX`` escapes.
    """
    path = f'{name}.json'
    with open(path, mode='w', encoding='utf-8') as handle:
        json.dump(item, handle, ensure_ascii=False, indent=2)

In [None]:
# the vocabulary path is kept as a list of components and joined with
# os.path.join, so it works on both Windows and Linux
file_bert = ['pretrained', 'assets', 'vocab.txt']

# notebook-level tokenizer built from the vocabulary file; the encode_* helpers
# defined further down use it to turn text into token ids
tokenizer = tokenization.FullTokenizer(vocab_file=os.path.join(*file_bert))

# number of entries in the loaded vocabulary
print(f'Vocab size: {len(tokenizer.vocab)}')

In [None]:
# Hyperparameters of the BERT encoder, turned into a BertConfig object below.
# NOTE(review): 'vocab_size' is hard-coded; it presumably has to equal the size
# of the vocab.txt loaded by the tokenizer (printed as 'Vocab size') — confirm.
config_dict = {
    'attention_probs_dropout_prob': 0.1,
    'hidden_act': 'gelu',
    'hidden_dropout_prob': 0.1,
    'hidden_size': 768,
    'initializer_range': 0.02,
    'intermediate_size': 3072,
    'max_position_embeddings': 512,
    'num_attention_heads': 12,
    'num_hidden_layers': 12,
    'type_vocab_size': 2,
    'vocab_size': 30522}

# configuration object later passed to bert_models.classifier_model
bert_config = configs.BertConfig.from_dict(config_dict)

In [None]:
# turn a sentence into vocabulary ids, terminated by the id of '[SEP]'
def encode_sentence(s):
    """Tokenize `s` with the notebook-level `tokenizer`, append '[SEP]' and
    return the list of token ids."""
    word_pieces = [*tokenizer.tokenize(s), '[SEP]']
    return tokenizer.convert_tokens_to_ids(word_pieces)

def encode_pair(q, a, max_size):
    """Encode a question/answer pair as fixed-length BERT inputs.

    The token sequence is `[CLS] q [SEP] a [SEP]`, converted to ids with the
    notebook-level `tokenizer` and right-padded with zeros to `max_size`.

    Args:
        q: question text.
        a: answer text.
        max_size: length of every returned list.

    Returns:
        A dict of three lists of length `max_size`:
        'input_word_ids' (token ids), 'input_mask' (1 for real tokens, 0 for
        padding) and 'input_type_ids' (0 for the question part and padding,
        1 for the answer part).

    Raises:
        IndexError: if the pair needs more than `max_size` tokens.
    """
    question_tokens = ['[CLS]'] + tokenizer.tokenize(q) + ['[SEP]']
    answer_tokens = tokenizer.tokenize(a) + ['[SEP]']
    token_ids = tokenizer.convert_tokens_to_ids(question_tokens + answer_tokens)

    padding = max_size - len(token_ids)
    if padding < 0:
        raise IndexError('Too many tokens')

    zeros = [0] * padding
    return {
        'input_word_ids': token_ids + zeros,
        'input_mask': [1] * len(token_ids) + zeros,
        'input_type_ids': [0] * len(question_tokens) + [1] * len(answer_tokens) + zeros,
    }
    
# Sanity checks for the two encoders against the loaded vocabulary.
# The expected ids end every sentence with 102 (the id expected for '[SEP]').
assert(encode_sentence('Human is instance of animal') == [2529, 2003, 6013, 1997, 4111, 102])
# A pair starts with 101 (the id expected for '[CLS]'), has 102 after the
# question and after the answer, and is zero-padded up to max_size (15 here).
assert(
    encode_pair('Who are you?', 'I am your dad.', 15)['input_word_ids'] ==
    [101, 2040, 2024, 2017, 1029, 102, 1045, 2572, 2115, 3611, 1012, 102, 0, 0, 0]
)

In [None]:
def bert_encode(class_0, class_1, tokenizer, size=0):
    """Encode two classes of (question, answer) records as dense BERT inputs.

    Every record becomes `[CLS] question [SEP] answer [SEP]`; the ragged rows
    are converted to dense tensors with `to_tensor()`, i.e. padded to the
    longest row.

    Args:
        class_0: list of records labelled 0; item [0] of a record is the
            question text, item [1] the answer text.
        class_1: list of records labelled 1, same layout.
        tokenizer: tokenizer used to look up the id of '[CLS]'. The sentences
            themselves are encoded by `encode_sentence`, which uses the
            notebook-level tokenizer.
        size: when non-zero, at most `size` randomly chosen records are kept
            from each class; 0 (default) keeps all records.

    Returns:
        A tuple `(inputs, labels)`. `inputs` is a dict with the tensors
        'input_word_ids', 'input_mask' and 'input_type_ids'; `labels` is a
        tensor holding 0 for every class_0 record followed by 1 for every
        class_1 record.
    """
    def shuffled_subset(records):
        # Shuffle a copy: shuffling `records[:size]` would only reorder a
        # temporary list (so `size` had no effect), and shuffling `records`
        # itself would reorder the caller's data in place.
        picked = list(records)
        random.shuffle(picked)
        return picked[:size] if size else picked

    class_0 = shuffled_subset(class_0)
    class_1 = shuffled_subset(class_1)

    labels = [0]*len(class_0) + [1]*len(class_1)
    records = class_0 + class_1

    questions = tf.ragged.constant([encode_sentence(s[0]) for s in records])
    answers = tf.ragged.constant([encode_sentence(s[1]) for s in records])

    # one single-element row holding the '[CLS]' id per record
    cls = [tokenizer.convert_tokens_to_ids(['[CLS]'])]*questions.shape[0]
    input_word_ids = tf.concat([cls, questions, answers], axis=-1)

    # mask is 1 on every real token; padding added by to_tensor() stays 0
    input_mask = tf.ones_like(input_word_ids).to_tensor()

    # segment ids: 0 for '[CLS]' and the question, 1 for the answer
    type_cls = tf.zeros_like(cls)
    type_question = tf.zeros_like(questions)
    type_answer = tf.ones_like(answers)
    input_type_ids = tf.concat([type_cls, type_question, type_answer], axis=-1).to_tensor()

    inputs = {
        'input_word_ids': input_word_ids.to_tensor(),
        'input_mask': input_mask,
        'input_type_ids': input_type_ids}

    return inputs, tf.convert_to_tensor(labels)

In [None]:
class DataGenerator(tf.keras.utils.Sequence):
    """Keras Sequence producing fixed-size batches of encoded question/answer pairs.

    Records are encoded lazily with `encode_pair` the first time they are
    requested and cached in `self.data`, so later requests reuse the encoding.
    """

    def __init__(self, class_0, class_1, embed_len=180, batch_size=32, shuffle=True):
        """Store the records and prepare the order in which they are visited.

        Args:
            class_0: list of records labelled 0; item [0] of a record is the
                question text, item [1] the answer text.
            class_1: list of records labelled 1, same layout.
            embed_len: length every encoded pair is padded to (passed to
                `encode_pair` as `max_size`).
            batch_size: number of records per batch.
            shuffle: when true, the visiting order is shuffled on creation and
                again at the end of every epoch.
        """
        super().__init__()
        self.text = class_0 + class_1
        self.labels = [0]*len(class_0) + [1]*len(class_1)
        self.embed_len = embed_len
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.count = len(self.text)
        self.indexes = list(range(self.count))
        # cache of encoded records, filled on first use
        self.data = [None]*len(self.text)
        # performs the initial shuffle
        self.on_epoch_end()

    def on_epoch_end(self):
        """Reshuffle the visiting order (only when `shuffle` is enabled)."""
        if self.shuffle:
            random.shuffle(self.indexes)

    def __len__(self):
        """Number of full batches per epoch; a trailing partial batch is dropped."""
        return self.count // self.batch_size

    def _encoded(self, i):
        """Return the encoded form of record `i`, encoding and caching it on first use."""
        if self.data[i] is None:
            self.data[i] = encode_pair(self.text[i][0], self.text[i][1], self.embed_len)
        return self.data[i]

    def __getitem__(self, index):
        """Build batch number `index`.

        Returns:
            `(inputs, outputs)`: `inputs` maps 'input_word_ids', 'input_mask'
            and 'input_type_ids' to one row per record of the batch;
            `outputs` is a tensor with the labels of those records.
        """
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]

        inputs = {
            'input_word_ids': [],
            'input_mask': [],
            'input_type_ids': []}

        outputs = []

        for i in indexes:
            encoded = self._encoded(i)
            for key in inputs:
                inputs[key].append(encoded[key])
            outputs.append(self.labels[i])

        for key in inputs:
            # inner_shape pins the result to batch_size rows of embed_len ids
            # NOTE(review): tf.constant may express this more directly — confirm
            inputs[key] = tf.ragged.constant(inputs[key], inner_shape=(self.batch_size, self.embed_len))
        outputs = tf.convert_to_tensor(outputs)

        return inputs, outputs

    def get_dataset(self):
        """Encode every record and return the whole data set as plain lists.

        Records are returned in their stored order (class_0 first, then
        class_1), independent of the shuffled batch order.

        Returns:
            `(inputs, outputs)`: `inputs` maps each input name to a list with
            one encoded row per record; `outputs` is the list of labels.
        """
        inputs = {
            'input_word_ids': [],
            'input_mask': [],
            'input_type_ids': []}

        outputs = []

        for i in range(len(self.text)):
            encoded = self._encoded(i)
            for key in inputs:
                inputs[key].append(encoded[key])
            outputs.append(self.labels[i])

        return inputs, outputs

In [None]:
# store per-batch and per-epoch results and dump them to a file if the name is not empty
class HistoryCallback(tf.keras.callbacks.Callback):
    """Keras callback that records the logs of every batch and epoch.

    The records are kept in `self.history` under the keys 'batch' and 'epoch'.
    At the end of each epoch the whole history is written with `json_save`,
    provided a non-empty file name was given.
    """

    def __init__(self, file_name, history=None):
        """
        Args:
            file_name: target passed to `json_save` (which appends '.json');
                an empty value disables writing.
            history: existing history dict with the lists 'epoch' and 'batch'
                to continue from; a fresh empty history is created when omitted.
        """
        super().__init__()
        # A new dict per instance: a dict literal used as the default argument
        # would be shared by every callback created without `history`.
        self.history = {'epoch': [], 'batch': []} if history is None else history
        self.name = file_name
        # 1-based number of the current epoch, None before training starts
        self.epoch = None

    def on_epoch_begin(self, epoch, logs=None):
        """Remember the 1-based epoch number for the batch records."""
        self.epoch = epoch + 1

    def on_epoch_end(self, epoch, logs=None):
        """Record the epoch logs and dump the history to file if a name is set."""
        self.epoch = epoch + 1
        # copy so the dict owned by Keras is not modified; tolerate logs=None
        record = dict(logs or {})
        record['epoch'] = self.epoch
        self.history['epoch'].append(record)

        if self.name:
            json_save(self.name, self.history)

    def on_train_batch_end(self, batch, logs=None):
        """Record the logs of one training batch together with its position."""
        # `batch` is compared against None instead of being tested for
        # truthiness, so the first batch of an epoch (index 0) is recorded too
        if logs and batch is not None:
            record = dict(logs)
            record['batch'] = batch
            record['epoch'] = self.epoch
            self.history['batch'].append(record)

In [None]:
# build the training samples from the textual form of every generated answer
# NOTE: a later cell defines a function with the same name (label-based
# variant); whichever of the two cells is executed last is the one in effect.
def create_train_set_samples(data):
    """Split the generated answers of `data` into wrong and right samples.

    Every truthy value of `data` contributes one sample
    `[question, answer text, label]` per entry of its 'generated' lists:
    label 1 for 'right', label 0 for 'wrong'. Falsy values are skipped.

    Returns:
        `(class_0, class_1, 'quanswer')` — the wrong samples, the right
        samples and the name of this data set variant.
    """
    negatives, positives = [], []

    for entry in filter(None, data.values()):
        positives.extend(
            [entry['vanilla']['question'], candidate['text'], 1]
            for candidate in entry['generated']['right'])
        negatives.extend(
            [entry['vanilla']['question'], candidate['text'], 0]
            for candidate in entry['generated']['wrong'])

    return negatives, positives, 'quanswer'

In [None]:
# build the training samples from the space-joined labels of every generated answer
# NOTE: an earlier cell defines a function with the same name (text-based
# variant); whichever of the two cells is executed last is the one in effect.
def create_train_set_samples(data):
    """Split the generated answers of `data` into wrong and right samples.

    Every truthy value of `data` contributes one sample
    `[question, labels joined by a space, label]` per entry of its
    'generated' lists: label 1 for 'right', label 0 for 'wrong'.
    Falsy values are skipped.

    Returns:
        `(class_0, class_1, 'labels')` — the wrong samples, the right samples
        and the name of this data set variant.
    """
    negatives, positives = [], []

    for entry in filter(None, data.values()):
        positives.extend(
            [entry['vanilla']['question'], ' '.join(candidate['labels']), 1]
            for candidate in entry['generated']['right'])
        negatives.extend(
            [entry['vanilla']['question'], ' '.join(candidate['labels']), 0]
            for candidate in entry['generated']['wrong'])

    return negatives, positives, 'labels'

In [None]:
# path components of the results file, without extension (json_load appends '.json')
data_file = data_path + ['vanilla_qanswer_results']

# os.sep is put in front so the joined path starts at the filesystem root
# NOTE(review): with a drive component such as 'D:\\' os.path.join presumably
# restarts the path at that drive and drops the leading separator — confirm
data = json_load(os.path.join(os.sep, *data_file))

In [None]:
# build the two sample lists with whichever create_train_set_samples variant
# was defined last, then report the variant name and the size of each class
class_0, class_1, dataset_name = create_train_set_samples(data)
print(dataset_name)
print(len(class_0))
print(len(class_1))

In [None]:
# path components of the pretrained classifier weights
file_pretrained = ['pretrained', 'bert_classifier.h5']

def create_model(bert_classifier, epochs=100, batch_size=8, batches_per_epoch=1000, warmup_epochs=5):
    """Compile `bert_classifier` for fine-tuning and load the pretrained weights.

    The model is compiled with an optimizer from `nlp.optimization`
    (initial learning rate 2e-5, with warm-up), sparse categorical
    cross-entropy on logits and an accuracy metric. The weights stored at
    `file_pretrained` are loaded afterwards. The model is modified in place;
    nothing is returned.

    Args:
        bert_classifier: Keras model to compile.
        epochs: number of epochs the learning-rate schedule is laid out for.
        batch_size: kept for interface compatibility; not used, because the
            schedule is measured in optimizer steps (batches), not in samples.
        batches_per_epoch: number of optimizer steps per epoch.
        warmup_epochs: number of epochs used for learning-rate warm-up.
    """
    # One optimizer step is taken per batch, so the schedule spans
    # epochs * batches_per_epoch steps. Multiplying by batch_size as well
    # would stretch the decay over batch_size times more steps than training
    # can ever run.
    num_train_steps = epochs*batches_per_epoch
    warmup_steps = batches_per_epoch*warmup_epochs

    # creates an optimizer with learning rate schedule
    optimizer = nlp.optimization.create_optimizer(
        2e-5, num_train_steps=num_train_steps, num_warmup_steps=warmup_steps)

    metrics = [tf.keras.metrics.SparseCategoricalAccuracy('accuracy', dtype=tf.float32)]
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

    bert_classifier.compile(
        optimizer=optimizer,
        loss=loss,
        metrics=metrics)

    bert_classifier.load_weights(os.path.join(*file_pretrained))

In [None]:
def train_model(bert_classifier, class_0, class_1, valid_0, valid_1, name,
                epochs=100, batch_size=8, batches_per_epoch=1000,
                warmup_epochs=5, embed=96):
    """Fit `bert_classifier` on the training records with early stopping.

    Training batches come from a `DataGenerator`; the validation records are
    encoded in one go with `bert_encode`. Batch and epoch logs are collected
    by a `HistoryCallback` created with `name` as its file name. Training
    stops once 'val_accuracy' has not improved for 5 epochs, and the best
    weights are restored.

    Args:
        bert_classifier: compiled Keras model to train.
        class_0, class_1: training records labelled 0 and 1.
        valid_0, valid_1: validation records labelled 0 and 1.
        name: file name handed to `HistoryCallback`.
        epochs: maximum number of epochs.
        batch_size: batch size of the generator; also passed to `fit`.
        batches_per_epoch: `steps_per_epoch` passed to `fit`.
        warmup_epochs: accepted but not used by this function.
        embed: padded length of every encoded training pair.
    """
    train_batches = DataGenerator(class_0, class_1, embed_len=embed, batch_size=batch_size)
    validation = bert_encode(valid_0, valid_1, tokenizer)

    recorder = HistoryCallback(file_name=name, history={'epoch': [], 'batch': []})
    early_stop = tf.keras.callbacks.EarlyStopping(
        monitor='val_accuracy',
        mode='max',
        patience=5,
        restore_best_weights=True)

    bert_classifier.fit(
        train_batches,
        steps_per_epoch=batches_per_epoch,
        validation_data=validation,
        batch_size=batch_size,
        epochs=epochs,
        callbacks=[recorder, early_stop],
    )

In [None]:
# run number, used to tell apart the output files of different runs
number = 10

bert_classifier, bert_encoder = bert_models.classifier_model(bert_config, num_labels=2)
create_model(bert_classifier, batch_size=8, batches_per_epoch=1000)

# Hold out the first 10% of each class for validation and train on the rest.
# The split is stored under new names so class_0/class_1 stay untouched and
# re-running this cell does not split already-split data again.
len_0 = len(class_0)
len_1 = len(class_1)
valid_0, valid_1 = class_0[:len_0 // 10], class_1[:len_1 // 10]
train_0, train_1 = class_0[len_0 // 10:], class_1[len_1 // 10:]

# the history file and the weights are written to 'h/'; create it up front so
# a missing directory cannot make the run fail after training has started
os.makedirs('h', exist_ok=True)

train_model(bert_classifier, train_0, train_1, valid_0, valid_1, f'h/{dataset_name}-{number}',
            batch_size=8, batches_per_epoch=1000, embed=256)

bert_classifier.save_weights(f'h/{dataset_name}-{number}.h5')