### Use BERT to score a blank-filling option
### Model based on BERT_Cloze

## 1. Loading and Initializing

In [1]:
import tensorflow as tf
import tensorflow_hub as hub
# Report the TensorFlow version in use and the GPU devices TensorFlow can see.
print("Using Tensorflow version: " + tf.__version__)
print(tf.config.list_physical_devices('GPU'))

# Directory of the BERT module: loaded through hub.KerasLayer and also the
# parent of the assets/vocab.txt file used to build the tokenizer.
BERT_DIR = "/home/aufish/Downloads/bert"

Using Tensorflow version: 2.1.0
[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [2]:
# Wrap the BERT module stored in BERT_DIR as a Keras layer; trainable=True
# marks its weights as trainable.
bert_module = hub.KerasLayer(BERT_DIR, trainable=True)

In [7]:
# tokenizer
from bert import tokenization

def create_tokenizer(vocab_file, do_lower_case=False):
    """Build a ``tokenization.FullTokenizer`` for the given vocabulary file.

    vocab_file: path of the vocabulary file handed to the tokenizer.
    do_lower_case: forwarded unchanged to the tokenizer (default False).
    """
    tokenizer_kwargs = {"vocab_file": vocab_file, "do_lower_case": do_lower_case}
    return tokenization.FullTokenizer(**tokenizer_kwargs)

# Module-level tokenizer built from the vocabulary file under BERT_DIR
# (do_lower_case is left at its default, False).
tokenizer = create_tokenizer(BERT_DIR + "/assets/vocab.txt")

def convert_sentence_to_features(sentence, tokenizer, max_seq_len=50):
    """Turn one sentence into fixed-length model inputs.

    The sentence is tokenized and wrapped as ``[CLS] ... [SEP]``. The token
    list is truncated so that, including both special tokens, it never exceeds
    ``max_seq_len``; shorter inputs are right-padded with zeros.

    sentence: the text to encode (a single sentence).
    tokenizer: object providing ``tokenize(text)`` and
        ``convert_tokens_to_ids(tokens)``.
    max_seq_len: length of every returned list.
    returns
    input_ids: token ids followed by zero padding
    input_mask: 1 for every real token, 0 for padding
    segment_ids: all zeros
    """
    tokens = ['[CLS]']
    tokens.extend(tokenizer.tokenize(sentence))
    # Keep at most max_seq_len - 1 tokens so the closing [SEP] still fits.
    tokens = tokens[:max_seq_len - 1]
    tokens.append('[SEP]')

    # Copy the ids so padding never mutates a list owned by the tokenizer.
    input_ids = list(tokenizer.convert_tokens_to_ids(tokens))
    input_mask = [1] * len(input_ids)
    segment_ids = [0] * len(tokens)

    # Zero-pad all three lists up to max_seq_len.
    padding = [0] * (max_seq_len - len(tokens))
    input_ids.extend(padding)
    input_mask.extend(padding)
    segment_ids.extend(padding)

    return input_ids, input_mask, segment_ids

def convert_sentences_to_features(sentences, tokenizer, max_seq_len=50):
    """Encode several sentences with ``convert_sentence_to_features``.

    Returns three parallel lists (input ids, input masks, segment ids), each
    holding one entry per sentence in the order the sentences were given.
    """
    per_sentence = [
        convert_sentence_to_features(text, tokenizer, max_seq_len)
        for text in sentences
    ]

    all_input_ids = [ids for ids, _, _ in per_sentence]
    all_input_mask = [mask for _, mask, _ in per_sentence]
    all_segment_ids = [segments for _, _, segments in per_sentence]

    return all_input_ids, all_input_mask, all_segment_ids

import random, copy
import numpy as np
# Vocabulary id of the '[MASK]' token, looked up once at module level.
MASK_ID = tokenizer.convert_tokens_to_ids(['[MASK]'])[0]
def make_mask(input_ids, input_mask, mask_loc, mask_id=None):
    '''
    Only make mask for one sentence
    input_ids: the ids of words in the sentence
    input_mask: initial mask (1 if there is a word; 0 for padding)
    mask_loc: which word should be masked, notice that the first in-sentence word has index 1
    mask_id: id written at mask_loc; defaults to the module-level MASK_ID
    returns
    new_input_ids: copy of input_ids where the word at mask_loc is replaced by mask_id
    new_input_mask: copy of input_mask where the bit at mask_loc is changed from 1 to 0
    original_word: the id that was replaced
    The arguments themselves are left unmodified. An AssertionError is raised
    when the mask bit at mask_loc is not 1 (for example, a padding position).
    '''
    if mask_id is None:
        # Resolved at call time so the module-level constant is only needed
        # when the caller does not supply an id.
        mask_id = MASK_ID

    # Work on copies so the caller's lists can be reused for other positions.
    new_input_mask = copy.deepcopy(input_mask)
    new_input_ids = copy.deepcopy(input_ids)

    original_word = input_ids[mask_loc]
    new_input_ids[mask_loc] = mask_id

    assert new_input_mask[mask_loc] == 1
    new_input_mask[mask_loc] = 0

    return new_input_ids, new_input_mask, original_word

### make mask sanity test

In [14]:
# Encode one short sentence to use as input for the make_mask check below.
input_ids, input_mask, segment_ids = convert_sentence_to_features("I love you", tokenizer)

In [19]:
# Mask position 1 (the first in-sentence word, right after [CLS]) and display
# the id that was replaced.
new_input_ids, new_input_mask, masked_word = make_mask(input_ids, input_mask, 1)
masked_word

146

### 1.2 Blank filler model (from BERT_Cloze)

In [25]:
class WordPredictor(tf.keras.Model):
    """Score how well each candidate word fits one blank per example.

    The inputs are run through BERT, the per-token output at the position
    given by ``mask_loc`` is selected for every example, dropout is applied,
    and a dense layer without activation maps the result to ``class_num``
    scores.
    """

    def __init__(self, class_num, bert=bert_module, dropout=0.1):
        """
        class_num: number of units in the output layer (one score per candidate word).
        bert: the BERT layer to call; defaults to the module-level bert_module.
        dropout: dropout rate applied before the output layer.
        """
        super().__init__()
        self.bert = bert
        self.drop = tf.keras.layers.Dropout(rate=dropout, trainable=True)

        self.dense = tf.keras.layers.Dense(
            class_num,
            activation=None,
            kernel_initializer='glorot_uniform',
            name='word_prediction',
            trainable=True)

    def call(self, inputs, mask_loc):
        """
        inputs: passed unchanged to the BERT layer.
        mask_loc: one position per example in the batch; the output at that
            position is the one that gets scored.
        returns: the dense layer's output for the selected positions.
        """
        # Only the per-token output is used; the pooled output is ignored.
        _, sequential = self.bert(inputs)

        # Select one position from each example. The batch size is read with
        # tf.shape (dynamic) instead of sequential.shape[0] (static), which is
        # None whenever the batch dimension is unknown at trace time.
        # assumes sequential is (batch, seq_len, hidden) — TODO confirm
        positions = tf.cast(mask_loc, tf.int32)
        batch_index = tf.range(tf.shape(sequential)[0])
        s = tf.gather_nd(sequential, tf.stack([batch_index, positions], axis=1))
        # s now holds one selected vector per example

        x = self.drop(s)
        return self.dense(x)

### 1.2.1 model calling sanity test

In [39]:
# Size the output layer to the tokenizer vocabulary: one score per vocabulary entry.
model = WordPredictor(len(tokenizer.vocab))
# Restore previously saved weights from "./word_predictor_9".
model.load_weights("./word_predictor_9")

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f702c229eb8>

In [45]:
# Mask position 2 of a single sentence ([CLS] is position 0) and print the ten
# highest-scoring vocabulary ids for that blank.
mask_loc = 2
# convert_sentence_to_features encodes ONE sentence, so it is given a string
# rather than a list of sentences.
input_ids, input_mask, segment_ids = convert_sentence_to_features("I love you", tokenizer)
new_input_ids, new_input_mask, original_word = make_mask(input_ids, input_mask, mask_loc)

# Each feature list (and mask_loc) is wrapped in a list to form a batch of size 1.
output = model([[new_input_ids], [new_input_mask], [segment_ids]], [mask_loc])
print(tf.argsort(output, axis=1, direction='DESCENDING')[0, :10])

tf.Tensor([1587 1328 2810 1567 1500 2367 6243 1176 3496 3683], shape=(10,), dtype=int32)


In [46]:
# Map the ten ids printed by the previous cell back to their tokens.
tokenizer.convert_ids_to_tokens([1587, 1328, 2810, 1567, 1500, 2367, 6243, 1176, 3496, 3683])

['tell',
 'want',
 'hope',
 'love',
 'told',
 'ask',
 'thank',
 'like',
 'trust',
 'wish']

## 2. Scorer

### 2.1 To design the scorer well, first investigate what ScratchGan outputs

In [52]:
from sys import path
path.append('/home/aufish/Documents/ScratchGan++')

from scratchgan.generators import LSTMGen

In [54]:
# Look up the id of '[PAD]' and the token for id 0; only the value of the
# last expression is displayed by the notebook.
tokenizer.convert_tokens_to_ids(['[PAD]'])
tokenizer.convert_ids_to_tokens([0])

['[PAD]']

In [None]:
# Hyper-parameters for the ScratchGan LSTM generator, keyed by name. Every
# value is read below when the generator is constructed.
default_config = {
    'vocab_size': len(tokenizer.vocab),
    'gen_feature_size': 512,
    # Number of generator layers. Must be an int: it multiplies the
    # one-element feature-size list below to give one size per layer.
    'num_layers_gen': 2,
    'max_seq_length': 50,
    'batch_size': 512,
    'layer_norm_gen': False,
    'trainable_embedding_size': 64,
    'gen_input_dropout': 0.0,
    'gen_output_dropout': 0.0,
    # Id 0 is the '[PAD]' token of the tokenizer's vocabulary.
    'pad_int': 0,
    # TODO: the draft left these two values blank; None is a placeholder
    # until the embedding source and vocabulary file are decided.
    'embedding_source': None,
    'vocab_file': None,
}

# Build the generator from default_config. LSTMGen is imported directly from
# scratchgan.generators, so it is referenced without a module prefix.
gen = LSTMGen(
    vocab_size=default_config['vocab_size'],
    feature_sizes=[default_config['gen_feature_size']] * default_config['num_layers_gen'],
    max_sequence_length=default_config['max_seq_length'],
    batch_size=default_config['batch_size'],
    use_layer_norm=default_config['layer_norm_gen'],
    trainable_embedding_size=default_config['trainable_embedding_size'],
    input_dropout=default_config['gen_input_dropout'],
    output_dropout=default_config['gen_output_dropout'],
    pad_token=default_config['pad_int'],
    embedding_source=default_config['embedding_source'],
    vocab_file=default_config['vocab_file'],
)

In [None]:
def score_sentences(model, sentences):
    # Given a list sentence, use WordPredictor to estimate how good each word is
    # returned result should be a tensor with dimension sentence_num x seq_length
    input_ids, input_mask, segment_ids = convert_sentences_to_features(sentences, tokenizer)
    for i in range(len(input_ids) - 2):
        # minus 2 because starting and ending tokens do not need to be masked
        new_input_ids, new_input_mask, original_word = make_mask(input_ids, input_mask, i)
        
        output = model([[new_input_ids], [new_input_mask]])