## 1. Loading BERT and experimenting

### 1.1 Define functions for tokenization

In [1]:
import tensorflow as tf
import tensorflow_hub as hub
import os

print("Using Tensorflow version: " + tf.__version__)
print(tf.config.list_physical_devices('GPU'))

# Directory holding the pre-downloaded BERT module (the vocabulary is read from
# <BERT_DIR>/assets/vocab.txt further down). Set the BERT_DIR environment
# variable to point elsewhere; the fallback is the original local path.
BERT_DIR = os.environ.get("BERT_DIR", "/home/aufish/Downloads/bert")

Using Tensorflow version: 2.1.0
[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [2]:
# Load BERT as a TF2 SavedModel wrapped in a Keras layer.
# Downloading from TF Hub did not work here, so the pre-downloaded copy in BERT_DIR is used
# instead of: bert_module = hub.Module("https://tfhub.dev/tensorflow/bert_en_cased_L-12_H-768_A-12/1")

# trainable=True exposes the module's variables as trainable weights of the layer.
bert_module = hub.KerasLayer(BERT_DIR, trainable=True)

In [3]:
# tokenizer
from bert import tokenization

def create_tokenizer(vocab_file, do_lower_case=False):
    """Build a BERT ``FullTokenizer`` from a vocabulary file.

    vocab_file: path to the vocabulary file
    do_lower_case: forwarded unchanged to ``FullTokenizer``
    """
    tokenizer_options = dict(vocab_file=vocab_file, do_lower_case=do_lower_case)
    return tokenization.FullTokenizer(**tokenizer_options)

# Shared tokenizer for the cells below; the vocabulary is read from inside the BERT directory.
tokenizer = create_tokenizer(BERT_DIR + "/assets/vocab.txt")

In [4]:
def convert_sentence_to_features(sentence, tokenizer, max_seq_len):
    """Turn one sentence into fixed-length BERT inputs.

    The sentence is tokenized, wrapped in ``[CLS]`` ... ``[SEP]``, truncated so
    that the ``[SEP]`` marker still fits, and zero-padded up to ``max_seq_len``.

    sentence: raw text
    tokenizer: object providing ``tokenize`` and ``convert_tokens_to_ids``
    max_seq_len: length of each returned list

    returns (input_ids, input_mask, segment_ids) where input_mask is 1 for
    real tokens and 0 for padding, and segment_ids is all zeros.
    """
    # Keep at most max_seq_len - 1 pieces so the closing [SEP] always fits.
    word_pieces = ['[CLS]', *tokenizer.tokenize(sentence)][:max_seq_len - 1]
    word_pieces.append('[SEP]')

    padding = [0] * (max_seq_len - len(word_pieces))

    input_ids = tokenizer.convert_tokens_to_ids(word_pieces)
    input_mask = [1] * len(input_ids) + padding
    input_ids.extend(padding)
    segment_ids = [0] * len(word_pieces) + padding

    return input_ids, input_mask, segment_ids

def convert_sentences_to_features(sentences, tokenizer, max_seq_len=50):
    """Apply ``convert_sentence_to_features`` to every sentence of a batch.

    sentences: iterable of raw sentences
    tokenizer: tokenizer handed to ``convert_sentence_to_features``
    max_seq_len: common length of every feature row (default 50)

    returns three parallel lists (all_input_ids, all_input_mask, all_segment_ids),
    one row per sentence.
    """
    per_sentence = [
        convert_sentence_to_features(sentence, tokenizer, max_seq_len)
        for sentence in sentences
    ]

    # Split the (ids, mask, segments) triples into three parallel lists.
    all_input_ids = [features[0] for features in per_sentence]
    all_input_mask = [features[1] for features in per_sentence]
    all_segment_ids = [features[2] for features in per_sentence]

    return all_input_ids, all_input_mask, all_segment_ids

### 1.2 Trial run of the functions above

In [None]:
# Two short example sentences, each converted to rows of length 20.
sentences = [
    'I prefer Python over Java',
    'I love ice cream the best',
]
input_ids_vals, input_mask_vals, segment_ids_vals = convert_sentences_to_features(
    sentences, tokenizer, max_seq_len=20)

In [None]:
# Baseline run: no word is hidden, the mask is 1 for every token and 0 for padding.
bert_inputs = [input_ids_vals, input_mask_vals, segment_ids_vals]

# Print ids, mask and segment ids, in that order.
for feature_rows in bert_inputs:
    print(feature_rows)

out = bert_module(bert_inputs)

# First element of the layer output (WordPredictor.call unpacks it as `pooled`).
print(out[0])

In [None]:
# Same inputs as the baseline, but with the mask bit of one position cleared.
import copy

# Deep copy so the baseline mask is left untouched.
input_mask_val_2 = copy.deepcopy(input_mask_vals)
input_mask_val_2[0][0] = 0

bert_inputs = [input_ids_vals, input_mask_val_2, segment_ids_vals]

# Print ids, modified mask and segment ids, in that order.
for feature_rows in bert_inputs:
    print(feature_rows)

out = bert_module(bert_inputs)

print(out[0])

## 2. Create the Keras classifier model

### 2.1 Add a layer to define predictor

In [5]:
class WordPredictor(tf.keras.Model):
    """BERT encoder followed by dropout and a linear layer that outputs logits.

    bert_layer: Keras layer whose output unpacks as (pooled, sequence)
    class_num: number of output classes
    drop_out: dropout rate applied to the pooled output (default 0.1)
    """
    def __init__(self, bert_layer, class_num, drop_out=0.1):
        super(WordPredictor, self).__init__()
        self.bert = bert_layer
        self.drop = tf.keras.layers.Dropout(rate=drop_out)
        # activation=None: the layer returns raw logits.
        self.dense= tf.keras.layers.Dense(
            class_num,
            activation=None,
            kernel_initializer='glorot_uniform',
            name='predictions/transform/logits')
        
    def call(self, inputs, training=None):
        """Return logits of shape (batch, class_num) for the BERT inputs.

        inputs: inputs accepted by ``bert_layer``
        training: Keras training flag, forwarded to BERT and to dropout so
            that an explicit mode chosen by the caller is honoured. ``None``
            (the default) keeps the previous behaviour of letting Keras decide.
        """
        # Only the pooled output is used; the per-token sequence output is dropped.
        pooled, _ = self.bert(inputs, training=training)
        x = self.drop(pooled, training=training)
        return self.dense(x)

### 2.2 Sanity test for model

In [None]:
# Sanity test: build the predictor with one output class per vocabulary entry,
# run a single forward pass, then list the weights that would be trained.
model = WordPredictor(bert_layer=bert_module, class_num=len(tokenizer.vocab))
model(bert_inputs)
model.trainable_weights

## 3. Train BERT for Masked-word Prediction

### 3.1 Util function to randomly mask a word

In [6]:
import random, copy
import numpy as np
def make_rand_mask(input_ids, input_mask, vocab_size, segment_id_vals=None):
    '''
    Hide one randomly chosen position per sentence and build its label.

    For each row an index is drawn uniformly from the first sum(input_mask[i])
    positions. This assumes the 1s of the mask form a prefix of the row, as
    produced by convert_sentence_to_features; the draw can therefore also land
    on the [CLS] / [SEP] markers. The inputs are not modified.

    input_ids: the ids of words in the sentences
    input_mask: initial mask (1 if there is a word; 0 for padding)
    vocab_size: width of each one-hot label row
    segment_id_vals: segment ids per sentence; all zeros are used when omitted

    returns
    input_mask: copy of the mask with the chosen bit set to 0 (word is masked)
    mask_word_ids: float32 tensor (batch, vocab_size), one-hot id of the masked word
    pure_ids: ids in number instead of one-hot (to generate weights per masked word)
    segment_id_vals: copy of the segment ids with the masked word marked as 1

    raises ValueError if a row of input_mask contains no 1
    '''
    batch_size = len(input_ids)

    # Work on copies so callers keep their original mask and segment ids.
    new_input_mask = copy.deepcopy(input_mask)
    if segment_id_vals is None:
        new_segment_ids = [[0] * len(row) for row in input_ids]
    else:
        new_segment_ids = copy.deepcopy(segment_id_vals)

    mask_word_ids = np.zeros((batch_size, vocab_size))
    pure_ids = []
    for i in range(batch_size):
        total_word = sum(input_mask[i])
        if total_word < 1:
            raise ValueError("Sentence {} has no token that can be masked".format(i))
        mask_word = random.randint(0, total_word-1)

        masked_id = input_ids[i][mask_word]
        pure_ids.append(masked_id)
        assert new_input_mask[i][mask_word] == 1
        new_input_mask[i][mask_word] = 0
        mask_word_ids[i][masked_id] = 1.0

        # Make the masked word segment id 1
        assert new_segment_ids[i][mask_word] == 0
        new_segment_ids[i][mask_word] = 1

    return new_input_mask, tf.convert_to_tensor(mask_word_ids, dtype=tf.dtypes.float32), pure_ids, new_segment_ids

### 3.2 Test masking function

In [None]:
# Mask one random word per trial sentence; the third return value (plain ids) is not needed here.
input_masks, labels, _, segment_ids_vals = make_rand_mask(
    input_ids_vals,
    input_mask_vals,
    len(tokenizer.vocab),
    segment_ids_vals,
)

In [None]:
# Feed the randomly masked inputs through the predictor; `result` holds its output (logits).
bert_inputs = [input_ids_vals, input_masks, segment_ids_vals]
result = model(bert_inputs)

### 3.3 Test gradient descent

In [None]:
# Single forward/backward pass to check that gradients reach the model weights.
opt = tf.keras.optimizers.Adam(learning_rate=0.0001)
total_loss = 0
with tf.GradientTape() as tape:
    result = model(bert_inputs)
    loss = tf.nn.softmax_cross_entropy_with_logits(labels=labels, logits=result)
    total_loss = total_loss + loss
grads = tape.gradient(target=loss, sources=model.trainable_weights)

In [None]:
# tape.gradient returns one entry per requested weight.
assert len(grads) == len(model.trainable_weights)
# Sum the accumulated loss values into a single number for display.
print(tf.reduce_sum(total_loss))

In [None]:
# Apply one optimizer step, pairing each gradient with its weight.
opt.apply_gradients(zip(grads, model.trainable_weights))

## 4. Load data from ScratchGan dataset

### 4.1 Load data

In [7]:
import json

DATA_FILE = "/home/aufish/Documents/ScratchGan++/scratchgan/emnlp_data/train.json"
# Read inside a context manager so the file handle is closed right after parsing.
with open(DATA_FILE, "r") as data_file:
    all_sentences = json.load(data_file)

# Each record is indexed with key 's' to get the sentence text; keep only that.
all_sentences = [sentence['s'] for sentence in all_sentences]
print(all_sentences[0])

My sources have suggested that so far the company sees no reason to change its tax structures , which are perfectly legal .


In [8]:
# To mitigate unbalanced weights, count different words
from collections import defaultdict

# Occurrences of every token id in the corpus; train_word_predictor reads this
# dict to weight the loss of a masked word by 1 / count.
count = defaultdict(int)

max_id, max_count = 0, 0
total_count = 0
for sentence in all_sentences:
    # Rows are zero-padded to the default max_seq_len, so padding positions
    # (id 0) are counted as well and inflate total_count.
    ids = convert_sentences_to_features([sentence], tokenizer)[0]
    # `token_id` instead of `id`: do not shadow the builtin in the notebook namespace.
    for token_id in ids[0]:
        count[token_id] += 1
        total_count += 1
        if count[token_id] > max_count:
            max_id = token_id
            max_count = count[token_id]

In [9]:
# Corpus statistics. The word total is counted over padded rows, so it includes padding positions.
print("Data size: {}".format(len(all_sentences)))
print("Number of words: {}".format(total_count))
print("Most frequent id: {}".format(max_id))

Data size: 268586
Number of words: 13429300
Most frequent id: 0


In [15]:
# Look up which token id 0 (the most frequent id reported above) stands for.
tokenizer.convert_ids_to_tokens([0])

['[PAD]']

### 4.2 Define training and evaluation functions

In [16]:
def train_word_predictor(model, all_sentences, tokenizer, batch_size = 1, epoch = 1,
                         eval_size = 1000, report_every = 1000):
    """Fine-tune `model` to predict one randomly masked word per sentence.

    The first `eval_size` sentences are held out and only used by
    `eval_accuracy`; training runs over the remaining sentences in order.
    Each example's loss is weighted by 1 / count[masked id], where `count` is
    the module-level frequency table, so every masked id must be present in it.

    model: predictor returning logits over the vocabulary
    all_sentences: list of raw sentences
    tokenizer: tokenizer exposing `vocab`, used for feature conversion
    batch_size: sentences per optimizer step (must be >= 1)
    epoch: number of passes over the training sentences
    eval_size: number of leading sentences reserved for evaluation
    report_every: loss and accuracy are printed whenever the current sentence
        index is a multiple of this value

    raises ValueError if batch_size is smaller than 1
    """
    if batch_size < 1:
        # A non-positive batch size would never advance start_index below.
        raise ValueError("batch_size must be at least 1")

    opt = tf.keras.optimizers.Adam(learning_rate=1e-5)
    data_size   = len(all_sentences)
    vocab_size  = len(tokenizer.vocab)
    eval_sentences = all_sentences[:eval_size]
    print("Data size: {}".format(data_size))
    for epoch_index in range(epoch):
        print("Starting epoch {}".format(epoch_index))
        
        # Training starts right after the held-out evaluation sentences.
        start_index = eval_size
        total_loss = 0
        
        while start_index < data_size:
            if start_index % report_every == 0:
                print("Sentence index: {}\r".format(start_index))
                
                print("Total loss: {}".format(total_loss))
                total_loss = 0

                accuracy = eval_accuracy(model, eval_sentences, tokenizer)
                print("Accuracy: {}".format(accuracy))
            end_index = min(data_size, start_index + batch_size)
            
            input_ids_vals, input_mask_vals, segment_ids_vals = \
                convert_sentences_to_features(all_sentences[start_index:end_index], tokenizer)
            
            input_masks, labels, masked_ids, segment_ids_vals = \
                make_rand_mask(input_ids_vals, input_mask_vals, vocab_size, segment_ids_vals)
    
            bert_inputs = [input_ids_vals, input_masks, segment_ids_vals]
            # Inverse-frequency weights so that frequent words do not dominate the loss.
            weights = tf.convert_to_tensor(
                np.array([1 / count[token_id] for token_id in masked_ids]))
    
            with tf.GradientTape() as tape:
                # training=True switches dropout on for the optimization step.
                result = model(bert_inputs, training=True)
                loss = tf.compat.v1.losses.softmax_cross_entropy(labels, result, weights=weights)
            # Bookkeeping happens outside the tape; it is not part of the gradient.
            total_loss += tf.reduce_sum(loss)
            grads = tape.gradient(loss, model.trainable_weights)
            
            opt.apply_gradients(zip(grads, model.trainable_weights))
            
            start_index = end_index

In [17]:
def eval_accuracy(model, all_sentences, tokenizer, batch_size=10):
    """Return the fraction of sentences whose masked word `model` predicts correctly.

    One random word per sentence is masked with `make_rand_mask`, so repeated
    calls can give different values.

    model: predictor returning logits over the vocabulary
    all_sentences: list of raw sentences to evaluate on (must not be empty)
    tokenizer: tokenizer exposing `vocab`, used for feature conversion
    batch_size: sentences per forward pass (default 10, must be >= 1)

    returns the accuracy as a Python float in [0, 1]
    raises ValueError for an empty sentence list or a batch size below 1
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")
    sentence_num = len(all_sentences)
    if sentence_num == 0:
        raise ValueError("eval_accuracy needs at least one sentence")

    vocab_size = len(tokenizer.vocab)
    correct_num = 0

    for start_index in range(0, sentence_num, batch_size):
        batch = all_sentences[start_index:start_index + batch_size]
        input_ids_vals, input_mask_vals, segment_ids_vals = \
            convert_sentences_to_features(batch, tokenizer)

        input_masks, labels, _, segment_ids_vals = \
            make_rand_mask(input_ids_vals, input_mask_vals, vocab_size, segment_ids_vals)

        bert_inputs = [input_ids_vals, input_masks, segment_ids_vals]

        # Evaluation runs in inference mode.
        result = model(bert_inputs, training=False)

        model_choices = tf.argmax(result, axis=1)
        target_ids = tf.argmax(labels, axis=1)

        equal_result = tf.equal(model_choices, target_ids)
        # Convert to a Python int so the running total and the result are plain numbers.
        correct_num += int(tf.reduce_sum(tf.cast(equal_result , tf.int32)))

    return correct_num / sentence_num

### 4.3 Training with model

In [18]:
# Reload the BERT layer from disk and build a new predictor on top of it, then train.
# NOTE(review): the recorded output below reports "Data size: 2000" and 100-step
# reports, which does not match this code or the corpus size printed earlier;
# it appears to come from an earlier version of the cells - confirm by re-running.
bert_module = hub.KerasLayer(BERT_DIR, trainable=True)
model = WordPredictor(bert_module, len(tokenizer.vocab))

train_word_predictor(model, all_sentences, tokenizer, batch_size=10, epoch=10)

Data size: 2000
Starting epoch 0
Sentence index: 0
Total loss: 0
Accuracy: 0.0
Sentence index: 100
Total loss: 0.08057606220245361
Accuracy: 0.0
Sentence index: 200
Total loss: 0.05446435138583183
Accuracy: 0.0
Sentence index: 300
Total loss: 0.05194595828652382
Accuracy: 0.0
Sentence index: 400
Total loss: 0.10195951908826828
Accuracy: 0.0
Sentence index: 500
Total loss: 0.0800694078207016
Accuracy: 0.0
Sentence index: 600
Total loss: 0.058692026883363724
Accuracy: 0.0
Sentence index: 700
Total loss: 0.05213850736618042
Accuracy: 0.0
Sentence index: 800
Total loss: 0.10485289990901947
Accuracy: 0.0
Sentence index: 900
Total loss: 0.059231966733932495
Accuracy: 0.0
Sentence index: 1000
Total loss: 0.07235340774059296
Accuracy: 0.0
Sentence index: 1100
Total loss: 0.06643466651439667
Accuracy: 0.0
Sentence index: 1200
Total loss: 0.10748433321714401
Accuracy: 0.0
Sentence index: 1300
Total loss: 0.04277251660823822
Accuracy: 0.0
Sentence index: 1400
Total loss: 0.09483958780765533
Accur

Accuracy: 0.075
Sentence index: 100
Total loss: 0.05028447508811951
Accuracy: 0.086
Sentence index: 200
Total loss: 0.06859728693962097
Accuracy: 0.075
Sentence index: 300
Total loss: 0.04195752739906311
Accuracy: 0.068
Sentence index: 400
Total loss: 0.05924701318144798
Accuracy: 0.062
Sentence index: 500
Total loss: 0.07010948657989502
Accuracy: 0.075
Sentence index: 600
Total loss: 0.07927380502223969
Accuracy: 0.052
Sentence index: 700
Total loss: 0.0774344876408577
Accuracy: 0.055
Sentence index: 800
Total loss: 0.051811426877975464
Accuracy: 0.06
Sentence index: 900
Total loss: 0.09306452423334122
Accuracy: 0.079
Sentence index: 1000
Total loss: 0.071143239736557
Accuracy: 0.08
Sentence index: 1100
Total loss: 0.10416867583990097
Accuracy: 0.052
Sentence index: 1200
Total loss: 0.101884625852108
Accuracy: 0.077
Sentence index: 1300
Total loss: 0.043635230511426926
Accuracy: 0.062
Sentence index: 1400
Total loss: 0.11085750162601471
Accuracy: 0.068
Sentence index: 1500
Total loss:

In [None]:
# Export the trained predictor in SavedModel format to ./trained_model.
tf.saved_model.save(model, "./trained_model")

In [None]:
# NOTE(review): model2 is a new predictor on the same bert_module and is not used in the cells shown here.
model2 = WordPredictor(bert_module, len(tokenizer.vocab))
# Load the exported model back from disk.
model3 = tf.saved_model.load("./trained_model")

### 4.4 Qualitatively check BERT's predictions

In [48]:
# Use a dedicated name for the probe sentence: assigning it to `all_sentences`
# would silently replace the training corpus for any cell that is re-run later.
probe_sentences = ["Governments are urged to take steps to protect the rights of women and children."]
input_ids_vals, input_mask_vals, segment_ids_vals = \
    convert_sentences_to_features(probe_sentences, tokenizer)

# Hide one random word of the probe sentence; `labels` is its one-hot id.
input_masks, labels, _, segment_ids_vals = \
    make_rand_mask(input_ids_vals, input_mask_vals, len(tokenizer.vocab), segment_ids_vals)

bert_inputs = [input_ids_vals, input_masks, segment_ids_vals]

result = model(bert_inputs)

In [49]:
# The 0 inside the run of 1s marks the hidden word; argmax of the one-hot label gives its id.
print(input_masks)
print(tf.argmax(labels, axis=1))

[[1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]]
tf.Tensor([1321], shape=(1,), dtype=int64)


In [52]:
# Derive the hidden word's id from `labels` instead of pasting it from a previous
# run: the mask position is random, so a hard-coded id goes stale on re-execution.
print("Missing word: {}".format(tokenizer.convert_ids_to_tokens([int(tf.argmax(labels, axis=1)[0])])))

Missing word: ['take']


In [51]:
# Ids of the 10 highest-scoring vocabulary entries for the probe sentence.
tf.argsort(result, axis=1, direction="DESCENDING")[0, :10]

<tf.Tensor: shape=(10,), dtype=int32, numpy=
array([20495,  4986, 25232,   500, 22621, 21363, 22785, 15454, 22321,
       25657], dtype=int32)>

In [53]:
# Take the top-10 ids directly from `result` rather than copying them from an
# earlier output, so this cell stays correct after retraining or re-masking.
candidates = tf.argsort(result, axis=1, direction="DESCENDING")[0, :10].numpy().tolist()
for candidate in candidates:
    # The printed value is the model's raw output score (logit) for the candidate.
    print(result[0, candidate])
    print(tokenizer.convert_ids_to_tokens([candidate]))

tf.Tensor(0.7388558, shape=(), dtype=float32)
['Buffy']
tf.Tensor(0.71015686, shape=(), dtype=float32)
['roughly']
tf.Tensor(0.7035765, shape=(), dtype=float32)
['##rogate']
tf.Tensor(0.6961367, shape=(), dtype=float32)
['щ']
tf.Tensor(0.6836092, shape=(), dtype=float32)
['wrestled']
tf.Tensor(0.6822932, shape=(), dtype=float32)
['lending']
tf.Tensor(0.67473334, shape=(), dtype=float32)
['##gets']
tf.Tensor(0.67315876, shape=(), dtype=float32)
['##gnant']
tf.Tensor(0.66435343, shape=(), dtype=float32)
['salute']
tf.Tensor(0.6582221, shape=(), dtype=float32)
['concerto']
