# Loading BERT and experimenting

In [1]:
import os

import tensorflow as tf
import tensorflow_hub as hub
# Report the runtime so saved outputs can be tied to a TensorFlow version / GPU setup.
print("Using Tensorflow version: " + tf.__version__)
print(tf.config.list_physical_devices('GPU'))

# Directory holding the pre-downloaded BERT SavedModel (with assets/vocab.txt inside).
# Set the BERT_DIR environment variable to point somewhere else; the fallback is
# the path this notebook was originally written against.
BERT_DIR = os.environ.get("BERT_DIR", "/home/aufish/Downloads/bert")

Using Tensorflow version: 2.1.0
[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [2]:
# Load BERT as a TF2 SavedModel wrapped in a Keras layer.
# Downloading the module from TF Hub did not work here, so the pre-downloaded
# copy in BERT_DIR is loaded instead. The hosted module that was tried is:
#   https://tfhub.dev/tensorflow/bert_en_cased_L-12_H-768_A-12/1
# trainable=True exposes the BERT weights as trainable variables.

bert_module = hub.KerasLayer(BERT_DIR, trainable=True)

In [3]:
# tokenizer
from bert import tokenization

def create_tokenizer(vocab_file, do_lower_case=False):
    """Build a BERT ``FullTokenizer`` from a vocabulary file.

    Args:
        vocab_file: path passed to ``FullTokenizer`` as ``vocab_file``.
        do_lower_case: passed through to ``FullTokenizer``; defaults to False.

    Returns:
        The ``tokenization.FullTokenizer`` instance.
    """
    tokenizer_kwargs = {"vocab_file": vocab_file, "do_lower_case": do_lower_case}
    return tokenization.FullTokenizer(**tokenizer_kwargs)

# The vocabulary is read from the assets/ folder inside BERT_DIR; casing is
# preserved because create_tokenizer defaults to do_lower_case=False.
tokenizer = create_tokenizer(BERT_DIR + "/assets/vocab.txt")

In [4]:
def convert_sentence_to_features(sentence, tokenizer, max_seq_len):
    """Encode one sentence as fixed-length BERT inputs.

    The sentence is tokenized, wrapped as ``[CLS] tokens... [SEP]``, truncated
    so that the wrapped sequence fits in ``max_seq_len`` positions, and then
    right-padded with zeros.

    Args:
        sentence: text handed to ``tokenizer.tokenize``.
        tokenizer: object providing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)``.
        max_seq_len: length of every returned list. Must be at least 2 so that
            both the ``[CLS]`` and ``[SEP]`` markers fit.

    Returns:
        ``(input_ids, input_mask, segment_ids)``, three lists of length
        ``max_seq_len``. ``input_mask`` is 1 on real tokens and 0 on padding;
        ``segment_ids`` is all zeros (single-segment input).

    Raises:
        ValueError: if ``max_seq_len`` is smaller than 2. Without this check
            the result would silently lose ``[CLS]`` or exceed ``max_seq_len``.
    """
    if max_seq_len < 2:
        raise ValueError(
            "max_seq_len must be at least 2 to hold [CLS] and [SEP], got %r" % (max_seq_len,))

    # Two positions are reserved for the [CLS] / [SEP] markers.
    word_pieces = list(tokenizer.tokenize(sentence))[:max_seq_len - 2]
    tokens = ['[CLS]'] + word_pieces + ['[SEP]']

    # Copy so padding never mutates a list owned by the tokenizer.
    input_ids = list(tokenizer.convert_tokens_to_ids(tokens))
    input_mask = [1] * len(input_ids)
    segment_ids = [0] * len(tokens)

    # Zero-pad all three lists up to max_seq_len.
    padding = [0] * (max_seq_len - len(tokens))
    input_ids.extend(padding)
    input_mask.extend(padding)
    segment_ids.extend(padding)

    return input_ids, input_mask, segment_ids

def convert_sentences_to_features(sentences, tokenizer, max_seq_len=20):
    """Encode a batch of sentences with ``convert_sentence_to_features``.

    Args:
        sentences: iterable of sentences, encoded one by one in order.
        tokenizer: tokenizer forwarded to ``convert_sentence_to_features``.
        max_seq_len: fixed sequence length forwarded unchanged (default 20).

    Returns:
        ``(all_input_ids, all_input_mask, all_segment_ids)``: three lists with
        one entry per sentence, in input order.
    """
    encoded = [
        convert_sentence_to_features(text, tokenizer, max_seq_len)
        for text in sentences
    ]

    # Regroup the per-sentence triples into three parallel lists.
    all_input_ids = [triple[0] for triple in encoded]
    all_input_mask = [triple[1] for triple in encoded]
    all_segment_ids = [triple[2] for triple in encoded]

    return all_input_ids, all_input_mask, all_segment_ids

In [5]:
# Two toy sentences, encoded as fixed-length (20 positions) BERT inputs.
sentences = ['I prefer Python over Java', 'I love ice cream the best']
input_ids_vals, input_mask_vals, segment_ids_vals = convert_sentences_to_features(sentences, tokenizer, 20)

In [6]:
# Baseline forward pass: the mask is 1 on every real token and 0 only on padding.
bert_inputs = [input_ids_vals, input_mask_vals, segment_ids_vals]

# Show the ids, the mask and the segment ids, one list per line.
print(*bert_inputs, sep="\n")

out = bert_module(bert_inputs)

# First element of the layer's output.
print(out[0])

[[101, 146, 9353, 23334, 1166, 9155, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 146, 1567, 2854, 7081, 1103, 1436, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
[[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
tf.Tensor(
[[-0.8131681   0.5370047   0.99996513 ...  0.999987   -0.5378665
   0.9919909 ]
 [-0.60621274  0.49540225  0.99991035 ...  0.9999771  -0.91028047
   0.9931733 ]], shape=(2, 768), dtype=float32)


In [None]:
# Repeat the forward pass with a single mask bit cleared, to compare against
# the baseline output above.
import copy

# Work on a copy so the original mask stays available for later cells.
input_mask_val_2 = copy.deepcopy(input_mask_vals)
input_mask_val_2[0][0] = 0  # first position of the first sentence

bert_inputs = [input_ids_vals, input_mask_val_2, segment_ids_vals]

# Show the ids, the modified mask and the segment ids, one list per line.
print(*bert_inputs, sep="\n")

out = bert_module(bert_inputs)

# First element of the layer's output.
print(out[0])

## Create a Keras classifier model

In [7]:
class WordPredictor(tf.keras.Model):
    """BERT encoder followed by dropout and a linear layer that outputs logits.

    Args:
        bert_layer: layer called on the model inputs. It must return a pair
            whose first element is the pooled representation; the second
            element is ignored.
        class_num: number of logits produced by the final dense layer.
        drop_out: dropout rate applied to the pooled representation.
    """

    def __init__(self, bert_layer, class_num, drop_out=0.1):
        super().__init__()
        self.bert = bert_layer
        self.drop = tf.keras.layers.Dropout(rate=drop_out)
        # No activation: the layer emits raw logits.
        self.dense = tf.keras.layers.Dense(
            class_num,
            activation=None,
            kernel_initializer='glorot_uniform',
            name='predictions/transform/logits')

    def call(self, inputs, training=None):
        """Return logits of shape ``(batch, class_num)`` for ``inputs``.

        Args:
            inputs: passed unchanged to the BERT layer.
            training: forwarded to the BERT layer and to dropout so callers
                (e.g. a custom ``GradientTape`` loop) can switch training
                behaviour on explicitly. ``None`` leaves the decision to Keras.
        """
        pooled, _ = self.bert(inputs, training=training)
        x = self.drop(pooled, training=training)
        return self.dense(x)

In [8]:
# Sanity test: build a predictor with one logit per vocabulary entry.
model = WordPredictor(bert_module, len(tokenizer.vocab))
# Call the model once on the current inputs, then list its trainable variables
# (presumably the call is what creates the dense layer's weights; confirm).
model(bert_inputs)
model.trainable_weights

[<tf.Variable 'bert_model/word_embeddings/embeddings:0' shape=(28996, 768) dtype=float32, numpy=
 array([[-0.00054784, -0.04156886,  0.01308366, ..., -0.0038919 ,
         -0.0335485 ,  0.0149841 ],
        [ 0.01688265, -0.03106827,  0.0042053 , ..., -0.01474032,
         -0.03561099, -0.0036223 ],
        [-0.00057234, -0.02673604,  0.00803954, ..., -0.01002474,
         -0.0331164 , -0.01651673],
        ...,
        [-0.00643814,  0.01658491, -0.02035619, ..., -0.04178825,
         -0.049201  ,  0.00416085],
        [-0.00483562, -0.00267701, -0.02901638, ..., -0.05116647,
          0.00449265, -0.01177113],
        [ 0.03134822, -0.02974372, -0.02302896, ..., -0.01454749,
         -0.05249038,  0.02843569]], dtype=float32)>,
 <tf.Variable 'bert_model/embedding_postprocessor/type_embeddings:0' shape=(2, 768) dtype=float32, numpy=
 array([[-3.6974233e-03,  1.7510093e-03, -1.4998456e-05, ...,
          4.1503753e-03, -4.3169265e-03,  2.5677346e-04],
        [-2.4836401e-03, -3.949347

## Train BERT for Masked-word Prediction

In [9]:
import random, copy
import numpy as np
def make_rand_mask(input_ids, input_mask, vocab_size):
    '''
    Hide one randomly chosen token per row and build one-hot labels for it.

    input_ids: per-row lists with the ids of the words in the sentences
    input_mask: per-row initial mask (1 if there is a word; 0 for padding)
    vocab_size: width of each one-hot label row

    returns
    new_input_mask: deep copy of input_mask with one bit of 1 replaced by 0 in
        every row, meaning that the word will be masked (input_mask itself is
        left untouched)
    mask_word_ids: int32 tensor of shape (batch_size, vocab_size), one-hot at
        the id of the word that was masked in each row

    raises
    ValueError: if a row has no position whose mask bit is 1

    NOTE(review): every position with mask 1 is eligible. For rows produced by
    convert_sentence_to_features that includes the [CLS] and [SEP] markers;
    confirm that this is intended.
    '''
    batch_size = len(input_ids)

    new_input_mask = copy.deepcopy(input_mask)
    mask_word_ids = np.zeros((batch_size, vocab_size))
    for i in range(batch_size):
        # Choose among the positions that are actually unmasked instead of
        # assuming the 1-bits form a contiguous run at the start of the row.
        candidates = [pos for pos, bit in enumerate(input_mask[i]) if bit == 1]
        if not candidates:
            raise ValueError("row %d of input_mask has no unmasked position to hide" % i)
        mask_word = candidates[random.randint(0, len(candidates) - 1)]

        new_input_mask[i][mask_word] = 0
        mask_word_ids[i][input_ids[i][mask_word]] = 1.0

    return new_input_mask, tf.convert_to_tensor(mask_word_ids, dtype=tf.dtypes.int32)

In [10]:
# Hide one random token per sentence; `labels` is the one-hot id of each hidden token.
input_masks, labels = make_rand_mask(input_ids_vals, input_mask_vals, len(tokenizer.vocab))

In [11]:
# Forward pass on the randomly masked inputs; `result` holds the model's logits.
bert_inputs = [input_ids_vals, input_masks, segment_ids_vals]
result = model(bert_inputs)

In [12]:
# One manual optimisation step: record the forward pass, then differentiate.
opt = tf.keras.optimizers.Adam(learning_rate=0.0001)
with tf.GradientTape() as tape:
    result = model(bert_inputs)
    # Cross-entropy between the one-hot labels and the predicted logits;
    # this yields one loss value per sentence.
    loss = tf.nn.softmax_cross_entropy_with_logits(labels=labels, logits=result)
# The loss is not reduced to a scalar, so the tape differentiates the sum of
# its elements with respect to every trainable weight.
grads = tape.gradient(loss, model.trainable_weights)

In [13]:
# There must be one gradient slot per trainable weight.
# NOTE(review): this does not check that the entries are non-None; confirm separately.
assert len(grads) == len(model.trainable_weights)

In [14]:
# Pair each gradient with its variable and let the optimizer apply the update.
opt.apply_gradients(zip(grads, model.trainable_weights))

<tf.Variable 'UnreadVariable' shape=() dtype=int64, numpy=1>

## Load data from ScratchGan dataset