## 1. Loading BERT and experimenting

### 1.1 Define functions for tokenization

In [1]:
import tensorflow as tf
import tensorflow_hub as hub
print("Using Tensorflow version: " + tf.__version__)
print(tf.config.list_physical_devices('GPU'))

BERT_DIR = "/home/aufish/Downloads/bert"

Using Tensorflow version: 2.1.0
[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [2]:
# try with TF2 SavedModel
# The online downloading method does not work, use pre-downloaded module
# bert_module = hub.Module("https://tfhub.dev/tensorflow/bert_en_cased_L-12_H-768_A-12/1")

bert_module = hub.KerasLayer(BERT_DIR, trainable=True)

In [3]:
# tokenizer
from bert import tokenization

def create_tokenizer(vocab_file, do_lower_case=False):
    return tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)

tokenizer = create_tokenizer(BERT_DIR + "/assets/vocab.txt")

In [4]:
# These functions are actually not used
def convert_sentence_to_features(sentence, tokenizer, max_seq_len):
    tokens = ['[CLS]']
    tokens.extend(tokenizer.tokenize(sentence))
    if len(tokens) > max_seq_len-1:
        tokens = tokens[:max_seq_len-1]
    tokens.append('[SEP]')
    
    segment_ids = [0] * len(tokens)
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_mask = [1] * len(input_ids)

    #Zero Mask till seq_length
    zero_mask = [0] * (max_seq_len-len(tokens))
    input_ids.extend(zero_mask)
    input_mask.extend(zero_mask)
    segment_ids.extend(zero_mask)
    
    return input_ids, input_mask, segment_ids

def convert_sentences_to_features(sentences, tokenizer, max_seq_len=50):
    all_input_ids = []
    all_input_mask = []
    all_segment_ids = []
    
    for sentence in sentences:
        input_ids, input_mask, segment_ids = convert_sentence_to_features(sentence, tokenizer, max_seq_len)
        all_input_ids.append(input_ids)
        all_input_mask.append(input_mask)
        all_segment_ids.append(segment_ids)
    
    return all_input_ids, all_input_mask, all_segment_ids

### 1.1 Trial run for methods above

In [5]:
sentences = ['I prefer Python over Java', 'I love ice cream the best']
input_ids_vals, input_mask_vals, segment_ids_vals = convert_sentences_to_features(sentences, tokenizer, 20)

In [10]:
# all 1 in mask
bert_inputs = [input_ids_vals, input_mask_vals, segment_ids_vals]

print(input_ids_vals)
print(input_mask_vals)
print(segment_ids_vals)

out = bert_module (bert_inputs)

print(out[0])
print(out[1])

[[101, 146, 9353, 23334, 1166, 9155, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 146, 1567, 2854, 7081, 1103, 1436, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
[[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
tf.Tensor(
[[-0.8131681   0.5370047   0.99996513 ...  0.999987   -0.5378665
   0.9919909 ]
 [-0.60621274  0.49540225  0.99991035 ...  0.9999771  -0.91028047
   0.9931733 ]], shape=(2, 768), dtype=float32)
tf.Tensor(
[[[ 0.8713086   0.00939399  0.02123968 ... -0.2102821   0.45887846
    0.23217922]
  [ 0.5732106  -0.13002536  0.33161387 ...  0.05539769 -0.10422937
    0.3469553 ]
  [ 0.42146546 -0.2859141  -0.23261456 ...  0.25038135 -0.60215116
    0.20282936]
  ...
  [ 0.37351245  0.26639333  0.43967193 ... -0.1845916   0.40790993
    0.4635392 ]
  [ 0.38079703  0.18587998  0.2

In [11]:
s = out[1]
tf.gather_nd(s, [[0, 2], [1, 5]])

<tf.Tensor: shape=(2, 768), dtype=float32, numpy=
array([[ 0.42146546, -0.2859141 , -0.23261456, ...,  0.25038135,
        -0.60215116,  0.20282936],
       [-0.5769223 , -0.00628331,  0.02436075, ...,  0.24750945,
         0.01002553,  0.13241625]], dtype=float32)>

In [None]:
# change value for mask of one word
import copy

input_mask_val_2 = copy.deepcopy(input_mask_vals)
input_mask_val_2[0][0] = 0

bert_inputs = [input_ids_vals, input_mask_val_2, segment_ids_vals]

print(input_ids_vals)
print(input_mask_val_2)
print(segment_ids_vals)

out = bert_module (bert_inputs)

print(out[0])

## 2. Create classifier model keras model

### 2.1 Add a layer to define predictor

In [21]:
class WordPredictor(tf.keras.Model):
    def __init__(self, bert_layer, class_num, drop_out=0.1):
        super(WordPredictor, self).__init__()
        self.bert = bert_layer
        self.drop1 = tf.keras.layers.Dropout(rate=drop_out)
        self.dense1= tf.keras.layers.Dense(
            768,
            activation=tf.keras.activations.relu,
            kernel_initializer='glorot_uniform',
            name='predictions/transform/hidden')
        
        self.drop2 = tf.keras.layers.Dropout(rate=drop_out)
        self.dense2 = tf.keras.layers.Dense(
            class_num,
            activation = None, 
            kernel_initializer='glorot_uniform',
            name='predictions/transform/final')
        
    @tf.function
    def call(self, inputs):
        assert len(inputs) == 2
        
        # masked_word_ids should be in the format [ [sentence_id, word_id], ... ]
        bert_layer_input, masked_word_ids = inputs
        pooled, sequential = self.bert(bert_layer_input)
        
        x = tf.gather_nd(sequential, masked_word_ids)
        # use sequential instead of pool
        x = self.drop1(x)
        x = self.dense1(x)
        x = self.drop2(x)
        return self.dense2(x)

### 2.2 Sanity test for model

In [22]:
# Sanity test
model = WordPredictor(bert_module, len(tokenizer.vocab))

input_with_id = [bert_inputs, [[i, 0] for i in range(len(bert_inputs))]]
print(len(input_with_id))
model(input_with_id)
for weight in model.trainable_weights:
    print(weight.name)


2
bert_model/word_embeddings/embeddings:0
bert_model/embedding_postprocessor/type_embeddings:0
bert_model/embedding_postprocessor/position_embeddings:0
bert_model/embedding_postprocessor/layer_norm/gamma:0
bert_model/embedding_postprocessor/layer_norm/beta:0
bert_model/encoder/layer_0/self_attention/query/kernel:0
bert_model/encoder/layer_0/self_attention/query/bias:0
bert_model/encoder/layer_0/self_attention/key/kernel:0
bert_model/encoder/layer_0/self_attention/key/bias:0
bert_model/encoder/layer_0/self_attention/value/kernel:0
bert_model/encoder/layer_0/self_attention/value/bias:0
bert_model/encoder/layer_0/self_attention_output/kernel:0
bert_model/encoder/layer_0/self_attention_output/bias:0
bert_model/encoder/layer_0/self_attention_layer_norm/gamma:0
bert_model/encoder/layer_0/self_attention_layer_norm/beta:0
bert_model/encoder/layer_0/intermediate/kernel:0
bert_model/encoder/layer_0/intermediate/bias:0
bert_model/encoder/layer_0/output/kernel:0
bert_model/encoder/layer_0/output/b

## 3. Train BERT for Masked-word Predition

### 3.1 Util function to randomly mask a word

In [25]:
import random, copy
import numpy as np
def make_rand_mask(input_ids, input_mask, vocab_size, segment_id_vals=None):
    ''' 
    input_ids: the ids of words in the sentences
    input_mask: initial mask (1 if there is a word; 0 for padding)
    returns
    input_mask: replace one bit of 1 with 0, meaning that the word will be masked
    mask_word_ids: the id of words that are masked
    pure_ids: ids in number instead of one-hot (to generate weights per masked word)
    segment_id_vals: mark the masked word with segment id 1
    '''
    batch_size = len(input_ids)
    
    new_input_mask = copy.deepcopy(input_mask)
    mask_word_ids = np.zeros((batch_size, vocab_size))
    pure_ids = []
    segment_encodings = []
    for i in range(batch_size):
        total_word = sum(input_mask[i])
        mask_word = random.randint(0, total_word-1)
        
        pure_ids.append(input_ids[i][mask_word])
        assert new_input_mask[i][mask_word] == 1
        new_input_mask[i][mask_word] = 0
        mask_word_ids[i][input_ids[i][mask_word]] = 1.0
        
        # Make the masked word segment id 1
        # assert segment_id_vals[i][mask_word] == 0
        # segment_id_vals[i][mask_word] = 1
                
    return new_input_mask, tf.convert_to_tensor(mask_word_ids, dtype=tf.dtypes.float32), pure_ids, segment_id_vals

### 3.2 Test masking function

In [26]:
input_masks, labels, pure_ids, segment_ids_vals = make_rand_mask(input_ids_vals, input_mask_vals, len(tokenizer.vocab), segment_ids_vals)

In [27]:
bert_inputs = [input_ids_vals, input_masks, segment_ids_vals]
masked_word_filter = [[i, pure_ids[i]] for i in range(len(pure_ids))]
result = model( [bert_inputs, masked_word_filter] )

### 3.3 Test gradient descent

In [None]:
opt = tf.keras.optimizers.Adam(learning_rate=0.0001)
total_loss = 0
with tf.GradientTape() as tape:
    result = model(bert_inputs)
    loss = tf.nn.softmax_cross_entropy_with_logits(labels, result)
    total_loss += loss
grads = tape.gradient(loss, model.trainable_weights)

In [None]:
assert len(grads) == len(model.trainable_weights)
print(tf.reduce_sum(total_loss))

In [None]:
opt.apply_gradients([(grads[i], model.trainable_weights[i]) for i in range(len(grads))])

## 4. Load data from ScratchGan dataset

### 4.1 Load data

In [69]:
import json

DATA_FILE = "/home/aufish/Documents/ScratchGan++/scratchgan/emnlp_data/train.json"
all_sentences = json.load(open(DATA_FILE, "r"))

all_sentences = [sentence['s'] for sentence in all_sentences]
print(all_sentences[0])

My sources have suggested that so far the company sees no reason to change its tax structures , which are perfectly legal .


In [70]:
# To mitigate unbalanced weights, count different words
from collections import defaultdict

count = defaultdict(int)

max_id, max_count = 0, 0
total_count = 0
for sentence in all_sentences:
    ids, _, _ = convert_sentences_to_features([sentence], tokenizer)
    id_list = ids[0]
    for id in id_list:
        count[id] += 1
        total_count += 1
        if count[id] > max_count:
            max_id = id
            max_count = count[id]

In [71]:
print("Data size: {}".format(len(all_sentences)))
print("Number of words: {}".format(total_count))
print("Most frequent id: {}".format(max_id))

Data size: 268586
Number of words: 13429300
Most frequent id: 0


In [72]:
tokenizer.convert_ids_to_tokens([0])

['[PAD]']

### 4.2 Define training and evaluation functions

In [76]:
def train_word_predictor(model, all_sentences, tokenizer, batch_size = 1, epoch = 1):
    opt = tf.keras.optimizers.Adam(learning_rate=1e-5)
    data_size   = len(all_sentences)
    print("Data size: {}".format(data_size))
    for i in range(epoch):
        print("Starting epoch {}".format(i))
        
        # Use the first 1000 sentences for evaluation
        start_index = 0
        end_index   = start_index + batch_size
        
        total_loss = 0
        
        while start_index < data_size:
            if start_index % 1000 == 0:
                print("Sentence index: {}\r".format(start_index))
                
                print("Total loss: {}".format(total_loss))
                total_loss = 0

                accuracy = eval_accuracy(model, all_sentences[:1000], tokenizer)
                print("Accuracy: {}".format(accuracy))
            end_index = min(data_size, start_index + batch_size)
            
            input_ids_vals, input_mask_vals, segment_ids_vals = \
                convert_sentences_to_features(all_sentences[start_index:end_index], tokenizer)
            
            input_masks, labels, masked_ids, segment_ids_vals = \
                make_rand_mask(input_ids_vals, input_mask_vals, len(tokenizer.vocab), segment_ids_vals)
    
            bert_inputs = [input_ids_vals, input_masks, segment_ids_vals]
            weights = tf.convert_to_tensor(np.array([1 / count[id] for id in masked_ids]))
    
            masked_word_filter = [[i, masked_ids[i]] for i in range(len(masked_ids))]
            with tf.GradientTape() as tape:
                result = model( (bert_inputs, masked_word_filter) )
                loss = tf.compat.v1.losses.softmax_cross_entropy(labels, result, weights=weights)
                total_loss += tf.reduce_sum(loss)
            grads = tape.gradient(loss, model.trainable_weights)
            
            opt.apply_gradients(zip(grads, model.trainable_weights))
            
            start_index = end_index

In [77]:
def eval_accuracy(model, all_sentences, tokenizer):
    correct_num = 0
    start_index = 0
    batch_size = 10
    end_index  = start_index + batch_size
    
    while start_index < len(all_sentences):
        end_index = min(len(all_sentences), start_index + batch_size)
        input_ids_vals, input_mask_vals, segment_ids_vals = \
            convert_sentences_to_features(all_sentences[start_index:end_index], tokenizer)

        input_masks, labels, pure_ids, segment_ids_vals = \
            make_rand_mask(input_ids_vals, input_mask_vals, len(tokenizer.vocab), segment_ids_vals)

        masked_word_filter = [[i, pure_ids[i]] for i in range(len(pure_ids))]
        bert_inputs = [input_ids_vals, input_masks, segment_ids_vals]

        result = model( (bert_inputs, masked_word_filter) )

        model_choices = tf.argmax(result, axis=1)
        labels = tf.argmax(labels, axis=1)

        equal_result = tf.equal(model_choices, labels)
        correct_num += tf.reduce_sum(tf.cast(equal_result , tf.int32))
        
        start_index = end_index
    
    return correct_num / len(all_sentences)

### 4.3 Training with model

In [None]:
bert_module = hub.KerasLayer(BERT_DIR, trainable=True)
model = WordPredictor(bert_module, len(tokenizer.vocab))

train_word_predictor(model, all_sentences, tokenizer, batch_size=10, epoch=1)

Data size: 268586
Starting epoch 0
Sentence index: 0
Total loss: 0
Accuracy: 0.0
Sentence index: 1000
Total loss: 0.7254385948181152
Accuracy: 0.02
Sentence index: 2000
Total loss: 0.8310202360153198
Accuracy: 0.019
Sentence index: 3000
Total loss: 0.7174498438835144
Accuracy: 0.034
Sentence index: 4000
Total loss: 0.706411600112915
Accuracy: 0.021
Sentence index: 5000
Total loss: 0.7249010801315308
Accuracy: 0.035
Sentence index: 6000
Total loss: 0.7465704083442688
Accuracy: 0.027
Sentence index: 7000
Total loss: 0.6836177110671997
Accuracy: 0.048
Sentence index: 8000
Total loss: 0.7154678702354431
Accuracy: 0.034
Sentence index: 9000
Total loss: 0.7998234629631042
Accuracy: 0.034
Sentence index: 10000
Total loss: 0.7788370251655579
Accuracy: 0.032
Sentence index: 11000
Total loss: 0.5717579126358032
Accuracy: 0.035
Sentence index: 12000
Total loss: 0.7386078834533691
Accuracy: 0.038
Sentence index: 13000
Total loss: 0.7186319231987
Accuracy: 0.032
Sentence index: 14000
Total loss: 0.

Sentence index: 119000
Total loss: 0.6968228816986084
Accuracy: 0.039
Sentence index: 120000
Total loss: 0.6684834957122803
Accuracy: 0.039
Sentence index: 121000
Total loss: 0.7644031643867493
Accuracy: 0.045
Sentence index: 122000
Total loss: 0.7840768694877625
Accuracy: 0.035
Sentence index: 123000
Total loss: 0.7109640836715698
Accuracy: 0.041
Sentence index: 124000
Total loss: 0.6807681918144226
Accuracy: 0.049
Sentence index: 125000
Total loss: 0.7106829881668091
Accuracy: 0.028
Sentence index: 126000
Total loss: 0.7672754526138306
Accuracy: 0.044
Sentence index: 127000
Total loss: 0.7463210225105286
Accuracy: 0.042
Sentence index: 128000
Total loss: 0.7222540378570557
Accuracy: 0.032
Sentence index: 129000
Total loss: 0.612622082233429
Accuracy: 0.041
Sentence index: 130000
Total loss: 0.7028218507766724
Accuracy: 0.031
Sentence index: 131000
Total loss: 0.746254026889801
Accuracy: 0.032
Sentence index: 132000
Total loss: 0.7208974957466125
Accuracy: 0.036
Sentence index: 133000

In [None]:
model.save_weights("./trained_model_v2")

In [None]:
# model2 = WordPredictor(bert_module, len(tokenizer.vocab))
model.load_weights("./trained_model_v2")

In [None]:
model(bert_inputs)

### 4.4 Qualitatively check predicting result of BERT

In [64]:
all_sentences = ["My sources have suggested that so far the company sees no reason to change its tax structures , which are perfectly legal ."]
input_ids_vals, input_mask_vals, segment_ids_vals = \
    convert_sentences_to_features(all_sentences, tokenizer)

input_masks, labels, pure_ids, segment_ids_vals = \
    make_rand_mask(input_ids_vals, input_mask_vals, len(tokenizer.vocab), segment_ids_vals)

bert_inputs = [input_ids_vals, input_masks, segment_ids_vals]
masked_ids  = [[i, pure_ids[i]] for i in range(len(pure_ids))]
result = model((bert_inputs, masked_ids))

In [65]:
print(input_masks)
print(tf.argmax(labels, axis=1))

[[1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
tf.Tensor([1103], shape=(1,), dtype=int64)


In [66]:
print("Missing word: {}".format(tokenizer.convert_ids_to_tokens([1103])))

Missing word: ['the']


In [67]:
tf.argsort(result, axis=1, direction="DESCENDING")[0, :10]

<tf.Tensor: shape=(10,), dtype=int32, numpy=
array([6150, 1115, 1177, 2732, 1849, 1185, 5302,  117, 1134, 1419],
      dtype=int32)>

In [68]:
candidates = [6150, 1115, 1177, 2732, 1849, 1185, 5302,  117, 1134, 1419]
for candidate in candidates:
    print(result[0, candidate])
    print(tokenizer.convert_ids_to_tokens([candidate]))

tf.Tensor(0.0002001503, shape=(), dtype=float32)
['perfectly']
tf.Tensor(0.00018320294, shape=(), dtype=float32)
['that']
tf.Tensor(0.00016705593, shape=(), dtype=float32)
['so']
tf.Tensor(0.00012558168, shape=(), dtype=float32)
['legal']
tf.Tensor(0.00010918726, shape=(), dtype=float32)
['change']
tf.Tensor(0.00010350955, shape=(), dtype=float32)
['no']
tf.Tensor(9.283192e-05, shape=(), dtype=float32)
['sees']
tf.Tensor(8.091487e-05, shape=(), dtype=float32)
[',']
tf.Tensor(7.809748e-05, shape=(), dtype=float32)
['which']
tf.Tensor(7.575437e-05, shape=(), dtype=float32)
['company']
