# Loading BERT and experimenting

In [1]:
import tensorflow as tf
import tensorflow_hub as hub
print("Using Tensorflow version: " + tf.__version__)
print(tf.config.list_physical_devices('GPU'))

BERT_DIR = "/home/aufish/Downloads/bert"

Using Tensorflow version: 2.1.0
[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [2]:
# try with TF2 SavedModel
# The online downloading method does not work, use pre-downloaded module
# bert_module = hub.Module("https://tfhub.dev/tensorflow/bert_en_cased_L-12_H-768_A-12/1")

bert_module = hub.load(BERT_DIR)

In [3]:
# tokenizer
from bert import tokenization

def create_tokenizer(vocab_file, do_lower_case=False):
    return tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)

tokenizer = create_tokenizer(BERT_DIR + "/assets/vocab.txt")

In [5]:
def convert_sentence_to_features(sentence, tokenizer, max_seq_len):
    tokens = ['[CLS]']
    tokens.extend(tokenizer.tokenize(sentence))
    if len(tokens) > max_seq_len-1:
        tokens = tokens[:max_seq_len-1]
    tokens.append('[SEP]')
    
    segment_ids = [0] * len(tokens)
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_mask = [1] * len(input_ids)

    #Zero Mask till seq_length
    zero_mask = [0] * (max_seq_len-len(tokens))
    input_ids.extend(zero_mask)
    input_mask.extend(zero_mask)
    segment_ids.extend(zero_mask)
    
    return input_ids, input_mask, segment_ids

def convert_sentences_to_features(sentences, tokenizer, max_seq_len=20):
    all_input_ids = []
    all_input_mask = []
    all_segment_ids = []
    
    for sentence in sentences:
        input_ids, input_mask, segment_ids = convert_sentence_to_features(sentence, tokenizer, max_seq_len)
        all_input_ids.append(input_ids)
        all_input_mask.append(input_mask)
        all_segment_ids.append(segment_ids)
    
    return all_input_ids, all_input_mask, all_segment_ids

In [17]:
sentences = ['I prefer Python over Java']
input_ids_vals, input_mask_vals, segment_ids_vals = convert_sentences_to_features(sentences, tokenizer, 20)

[101, 146, 9353, 23334, 1166, 9155, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
tf.Tensor(
[[-8.13168168e-01  5.37004828e-01  9.99965131e-01 -9.95936573e-01
   9.59787548e-01  7.93577373e-01  9.88739848e-01 -9.51556385e-01
  -9.80605185e-01 -6.68342054e-01  9.88626897e-01  9.99266505e-01
  -9.92237031e-01 -9.99913931e-01  5.71096301e-01 -9.82330680e-01
   9.94465053e-01 -6.70467496e-01 -9.99990642e-01 -5.95801175e-01
  -2.90958464e-01 -9.99944866e-01  3.65348727e-01  9.38748062e-01
   9.72101450e-01  7.80079979e-03  9.93025243e-01  9.99986172e-01
   9.13872898e-01 -4.82904315e-02  3.81492168e-01 -9.96081889e-01
   6.69135928e-01 -9.99616861e-01  3.23957533e-01 -7.37068057e-02
   6.31336629e-01 -4.18109655e-01  6.25919342e-01 -9.02191043e-01
  -7.32699156e-01 -7.56190777e-01  4.69528168e-01 -6.72841430e-0

In [19]:
# all 1 in mask
bert_inputs = [input_ids_vals, input_mask_vals, segment_ids_vals]

print(input_ids_vals)
print(input_mask_vals)
print(segment_ids_vals)

out = bert_module (bert_inputs)

print(out[0])

[[101, 146, 9353, 23334, 1166, 9155, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
[[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
[[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
tf.Tensor(
[[-8.13168168e-01  5.37004828e-01  9.99965131e-01 -9.95936573e-01
   9.59787548e-01  7.93577373e-01  9.88739848e-01 -9.51556385e-01
  -9.80605185e-01 -6.68342054e-01  9.88626897e-01  9.99266505e-01
  -9.92237031e-01 -9.99913931e-01  5.71096301e-01 -9.82330680e-01
   9.94465053e-01 -6.70467496e-01 -9.99990642e-01 -5.95801175e-01
  -2.90958464e-01 -9.99944866e-01  3.65348727e-01  9.38748062e-01
   9.72101450e-01  7.80079979e-03  9.93025243e-01  9.99986172e-01
   9.13872898e-01 -4.82904315e-02  3.81492168e-01 -9.96081889e-01
   6.69135928e-01 -9.99616861e-01  3.23957533e-01 -7.37068057e-02
   6.31336629e-01 -4.18109655e-01  6.25919342e-01 -9.02191043e-01
  -7.32699156e-01 -7.56190777e-01  4.69528168e-01 -6.72841

In [23]:
# change value for mask of one word
import copy

input_mask_val_2 = copy.deepcopy(input_mask_vals)
input_mask_val_2[0][0] = 0


bert_inputs = [input_ids_vals, input_mask_val_2, segment_ids_vals]

print(input_ids_vals)
print(input_mask_val_2)
print(segment_ids_vals)

out = bert_module (bert_inputs)

print(out[0])

[[101, 146, 9353, 23334, 1166, 9155, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
[[0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
tf.Tensor(
[[-0.7495881   0.42150226  0.9999357  -0.9956113   0.9561252   0.83232427
   0.9884643  -0.9753574  -0.9746838  -0.68136847  0.98769104  0.99920106
  -0.9962359  -0.9998708   0.78728586 -0.97859603  0.99461234 -0.6115519
  -0.9999842  -0.7668021  -0.53842455 -0.9998997   0.31181708  0.9602419
   0.97317487  0.00873318  0.9920776   0.9999777   0.9132364  -0.18665978
   0.34439304 -0.9949617   0.7738303  -0.99948895  0.22586994  0.00405888
   0.7183089  -0.33285704  0.7466531  -0.9337601  -0.670318   -0.86028147
   0.56638736 -0.63287514  0.70299566  0.35989898  0.05696377 -0.03103986
  -0.19797339  0.9999039  -0.96726674  0.99982995 -0.9890568   0.99865496
   0.9974677   0.43166074  0.9970056   0.22132374 -0.99426854  0.28336212
   0.96873635  0.08246753  0.91814315 -0.18

In [12]:
# This is using graph execution instead of eager execution
# graph execution sounds like functional-programming related idea
BERT_DIR = "/home/aufish/Downloads/bert"

with tf.compat.v1.Session() as sess:
    sess.run(tf.compat.v1.global_variables_initializer())
    
    input_ids = tf.compat.v1.placeholder(dtype=tf.int32, shape=[None, None])
    input_mask = tf.compat.v1.placeholder(dtype=tf.int32, shape=[None, None])
    segment_ids = tf.compat.v1.placeholder(dtype=tf.int32, shape=[None, None])

#     bert_inputs = dict(
#         input_ids=input_ids,
#         input_mask=input_mask,
#         segment_ids=segment_ids)
    
    bert_inputs = [input_ids, input_mask, segment_ids]

    bert_outputs = bert_module(bert_inputs, signature="tokens", as_dict=True)

ValueError: Could not find matching function to call loaded from the SavedModel. Got:
  Positional arguments (3 total):
    * [<tf.Tensor 'inputs:0' shape=(None, None) dtype=int32>, <tf.Tensor 'inputs_1:0' shape=(None, None) dtype=int32>, <tf.Tensor 'inputs_2:0' shape=(None, None) dtype=int32>]
    * False
    * None
  Keyword arguments: {'signature': 'tokens', 'as_dict': True}

Expected these arguments to match one of the following 4 option(s):

Option 1:
  Positional arguments (3 total):
    * [TensorSpec(shape=(None, None), dtype=tf.int32, name='inputs/0'), TensorSpec(shape=(None, None), dtype=tf.int32, name='inputs/1'), TensorSpec(shape=(None, None), dtype=tf.int32, name='inputs/2')]
    * True
    * None
  Keyword arguments: {}

Option 2:
  Positional arguments (3 total):
    * [TensorSpec(shape=(None, None), dtype=tf.int32, name='input_word_ids'), TensorSpec(shape=(None, None), dtype=tf.int32, name='input_mask'), TensorSpec(shape=(None, None), dtype=tf.int32, name='input_type_ids')]
    * False
    * None
  Keyword arguments: {}

Option 3:
  Positional arguments (3 total):
    * [TensorSpec(shape=(None, None), dtype=tf.int32, name='input_word_ids'), TensorSpec(shape=(None, None), dtype=tf.int32, name='input_mask'), TensorSpec(shape=(None, None), dtype=tf.int32, name='input_type_ids')]
    * True
    * None
  Keyword arguments: {}

Option 4:
  Positional arguments (3 total):
    * [TensorSpec(shape=(None, None), dtype=tf.int32, name='inputs/0'), TensorSpec(shape=(None, None), dtype=tf.int32, name='inputs/1'), TensorSpec(shape=(None, None), dtype=tf.int32, name='inputs/2')]
    * False
    * None
  Keyword arguments: {}

## Train BERT for Masked-word Predition

In [45]:
import random, copy
import numpy as np
def make_rand_mask(input_ids, input_mask):
    ''' 
    input_ids: the ids of words in the sentences
    input_mask: initial mask (1 if there is a word; 0 for padding)
    returns
    input_mask: replace one bit of 1 with 0, meaning that the word will be masked
    mask_word_ids: the id of words that are masked
    '''
    batch_size = len(input_ids)
    
    new_input_mask = copy.deepcopy(input_mask)
    mask_word_ids = np.zeros((batch_size, 1))
    for i in range(batch_size):
        total_word = sum(input_mask[i])
        mask_word = random.randint(0, total_word-1)
        
        assert new_input_mask[i][mask_word] == 1
        new_input_mask[i][mask_word] = 0
        mask_word_ids[i] = input_ids[i][mask_word]
        
    return new_input_mask, tf.convert_to_tensor(mask_word_ids, dtype=tf.dtypes.int32)

In [46]:
make_rand_mask(input_ids_vals, input_mask_vals)

([[0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
 <tf.Tensor: shape=(1, 1), dtype=int32, numpy=array([[101]], dtype=int32)>)