In [1]:
import tensorflow as tf
import keras 
import tensorflow_hub as hub # to get our bert model
import tensorflow_text as text

In [2]:
# TF Hub handle of the text-preprocessing model (raw strings -> BERT input tensors).
preprocess_url = (
    "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
)

# TF Hub handle of the BERT encoder itself (uncased English, L-12 / H-768 / A-12 variant).
encoder_url = (
    "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4"
)

In [3]:
# Build the preprocessing layer by wrapping the TF Hub model at preprocess_url as a Keras layer.
# NOTE(review): `tensorflow_text` is imported above but never referenced by name; it is presumably
# needed for its side effect of registering ops used by this layer — confirm before removing it.
bert_preprocess_model = hub.KerasLayer(preprocess_url)

In [4]:
# Two sample sentences to push through the preprocessing layer.
text_test = [
    'nice movie indeed',
    'I love python programming',
]

# Calling the layer on the raw strings yields a dictionary of tensors; its keys are displayed below.
text_preprocessed = bert_preprocess_model(text_test)
text_preprocessed.keys()

dict_keys(['input_mask', 'input_type_ids', 'input_word_ids'])

In [5]:
text_preprocessed['input_mask']
# input_mask is 1 at every position that holds a real token and 0 at padding positions.
# A [CLS] token is added at the start of each sentence and a [SEP] token at the end,
# so the first sentence has 5 tokens (3 words + 2 special tokens) and the second has 6 (4 words + 2).

<tf.Tensor: shape=(2, 128), dtype=int32, numpy=
array([[1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])>

In [6]:
text_preprocessed['input_type_ids'] 
# shape is (2, 128): 2 sentences, each padded out to 128 token positions (not 128 actual words).
# Every value is 0 here. Presumably this is the segment id that tells sentence A from sentence B
# in sentence-pair inputs — TODO confirm against the preprocessing model's documentation.

<tf.Tensor: shape=(2, 128), dtype=int32, numpy=
array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])>

In [7]:
text_preprocessed['input_word_ids']
# the vocabulary id of each token in each sentence: 101 --> [CLS], 102 --> [SEP], 0 --> padding

<tf.Tensor: shape=(2, 128), dtype=int32, numpy=
array([[  101,  3835,  3185,  5262,   102,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0, 

In [8]:
# Now build the BERT encoder layer from its TF Hub handle (encoder_url).
# The original author notes that this step can take roughly 5 minutes.
bert_model = hub.KerasLayer(encoder_url)

In [9]:
# Apply the encoder to the preprocessed-input dictionary; the result is again a dictionary,
# whose keys are displayed below.
bert_results = bert_model(text_preprocessed)
bert_results.keys()

dict_keys(['default', 'encoder_outputs', 'pooled_output', 'sequence_output'])

In [10]:
bert_results['pooled_output']
# pooled_output is the embedding of each entire sentence: shape (2, 768) means
# 2 sentences, each represented by a single vector of length 768.

<tf.Tensor: shape=(2, 768), dtype=float32, numpy=
array([[-0.791774  , -0.21411917,  0.49769533, ...,  0.24465217,
        -0.47334474,  0.8175868 ],
       [-0.9171231 , -0.47935176, -0.7865696 , ..., -0.61751723,
        -0.7102685 ,  0.92184293]], dtype=float32)>

In [11]:
bert_results['sequence_output'] 
# sequence_output holds the individual token embeddings: shape (2, 128, 768) means 2 sentences,
# each with 128 token positions (real tokens plus padding), and each position has an embedding of length 768.

<tf.Tensor: shape=(2, 128, 768), dtype=float32, numpy=
array([[[ 0.07292064,  0.08567819,  0.14476836, ..., -0.09677088,
          0.08722144,  0.07711092],
        [ 0.17839417, -0.19006042,  0.50349486, ..., -0.05869795,
          0.32717082, -0.15578541],
        [ 0.18701479, -0.43388715, -0.48875144, ..., -0.15502736,
          0.00145109, -0.2447098 ],
        ...,
        [ 0.12083037,  0.12884255,  0.4645356 , ...,  0.07375544,
          0.17441946,  0.16522126],
        [ 0.07967836, -0.01190655,  0.5022546 , ...,  0.13777718,
          0.21002199,  0.00624621],
        [-0.072127  , -0.28303406,  0.5903339 , ...,  0.47551882,
          0.16668493, -0.08920337]],

       [[-0.07900581,  0.36335114, -0.21101557, ..., -0.17183751,
          0.16299753,  0.67242676],
        [ 0.2788351 ,  0.43716332, -0.3576475 , ..., -0.04463701,
          0.3831518 ,  0.58879906],
        [ 1.2037671 ,  1.0727016 ,  0.48408777, ...,  0.24920999,
          0.4073099 ,  0.40481806],
        ...,

In [13]:
# We use BERT base, which has 12 encoder layers, so 12 entries are expected here.
print(len(bert_results['encoder_outputs']))
bert_results['encoder_outputs'] 
# each layer's output has shape (2, 128, 768): a 768-long embedding for every token position
# the output of the last (final) encoder is the same as our sequence_output (final token embeddings)
# NOTE(review): displaying the whole list dumps 12 large tensors into the notebook output;
# consider showing only the shapes instead.

12


[<tf.Tensor: shape=(2, 128, 768), dtype=float32, numpy=
 array([[[ 0.12901425,  0.00644747, -0.0361497 , ...,  0.04999633,
           0.06149192, -0.02657545],
         [ 1.1753384 ,  1.2140784 ,  1.1569979 , ...,  0.11634396,
          -0.35855335, -0.40490183],
         [ 0.03859033,  0.5386997 , -0.21089774, ...,  0.21858189,
           0.72601664, -1.1158603 ],
         ...,
         [-0.07587016, -0.254219  ,  0.7075511 , ...,  0.50541997,
          -0.1887868 ,  0.15028326],
         [-0.16066611, -0.28089687,  0.57597065, ...,  0.52758545,
          -0.11141388,  0.02887545],
         [-0.04428154, -0.20279586,  0.5909355 , ...,  0.8133835 ,
          -0.39075807, -0.02601737]],
 
        [[ 0.18903585,  0.02752546, -0.06513744, ..., -0.00620212,
           0.15053892,  0.03165444],
         [ 0.5916151 ,  0.7589137 , -0.07240665, ...,  0.6190394 ,
           0.8292891 ,  0.16161951],
         [ 1.4460827 ,  0.44602644,  0.4099025 , ...,  0.48255914,
           0.62691146,  0.13

In [14]:
# Check that the last encoder layer's output is exactly the final token embeddings (sequence_output).
# The displayed repr of a (2, 128, 768) tensor elides most elements, so on its own it cannot prove
# that every element matches; the assertion below reduces the comparison to a single verdict and
# fails loudly if any element differs.
elementwise_match = bert_results['encoder_outputs'][-1] == bert_results['sequence_output']
assert bool(tf.reduce_all(elementwise_match)), "last encoder output differs from sequence_output"
elementwise_match  # still display the element-wise comparison for inspection

<tf.Tensor: shape=(2, 128, 768), dtype=bool, numpy=
array([[[ True,  True,  True, ...,  True,  True,  True],
        [ True,  True,  True, ...,  True,  True,  True],
        [ True,  True,  True, ...,  True,  True,  True],
        ...,
        [ True,  True,  True, ...,  True,  True,  True],
        [ True,  True,  True, ...,  True,  True,  True],
        [ True,  True,  True, ...,  True,  True,  True]],

       [[ True,  True,  True, ...,  True,  True,  True],
        [ True,  True,  True, ...,  True,  True,  True],
        [ True,  True,  True, ...,  True,  True,  True],
        ...,
        [ True,  True,  True, ...,  True,  True,  True],
        [ True,  True,  True, ...,  True,  True,  True],
        [ True,  True,  True, ...,  True,  True,  True]]])>