In [1]:
# Name of the BERT model to use
model_name = 'distilbert-base-uncased'

# Max length of the sentences
max_length = 120

In [2]:

# Huggingface transformers
from transformers import DistilBertConfig,  DistilBertTokenizer

# Tensorflow
import tensorflow as tf

In [3]:
# Load transformers config and set output_hidden_states to False
config = DistilBertConfig.from_pretrained(model_name)
config.output_hidden_states = False

config

DistilBertConfig {
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.11.3",
  "vocab_size": 30522
}

In [4]:
# Load BERT tokenizer
tokenizer = DistilBertTokenizer.from_pretrained(pretrained_model_name_or_path = model_name, config = config)

tokenizer

PreTrainedTokenizer(name_or_path='distilbert-base-uncased', vocab_size=30522, model_max_len=512, is_fast=False, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [5]:
# Load the model from disk
model = tf.keras.models.load_model('saved_model/model')

# Check its architecture
model.summary()

Model: "BERT_Sereniiti"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 120)]        0                                            
__________________________________________________________________________________________________
attention_mask (InputLayer)     [(None, 120)]        0                                            
__________________________________________________________________________________________________
distilbert (Custom>TFDistilBert {'last_hidden_state' 66362880    input_ids[0][0]                  
                                                                 attention_mask[0][0]             
__________________________________________________________________________________________________
tf.__operators__.getitem (Slici (None, 768)          0           distilbert[0][0]    

In [6]:
# Test it on a sentence

sentence = "I love chocolate"

embedded_sentence = tokenizer(
    text=sentence,
    add_special_tokens=True,
    max_length=max_length,
    truncation=True,
    padding='max_length', 
    return_tensors='tf',
    return_token_type_ids = True,
    return_attention_mask = True,
    verbose = True)

embedded_sentence

{'input_ids': <tf.Tensor: shape=(1, 120), dtype=int32, numpy=
array([[ 101, 1045, 2293, 7967,  102,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0]])>, 'token_type_ids': <tf.Tensor: shape=(1, 120), dtype=int32, numpy=
array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [7]:
model.call(embedded_sentence)

{'Y1': <tf.Tensor: shape=(1, 4), dtype=float32, numpy=array([[1.       , 1.       , 1.       , 0.9999989]], dtype=float32)>,
 'Y2': <tf.Tensor: shape=(1, 16), dtype=float32, numpy=
 array([[1.        , 0.99999785, 1.        , 0.9999987 , 0.99406767,
         0.99998116, 0.99995327, 0.05581928, 0.99999774, 0.999892  ,
         0.9998847 , 0.99999976, 0.99999964, 0.9958033 , 0.9999999 ,
         0.07062819]], dtype=float32)>,
 'Y3': <tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[0.96619314]], dtype=float32)>}

In [9]:
# Test it on a set of sentences

sentences = ["You are such a bad guy","I feel very good with you","Yesterday I was eating breakfast","There is not such thing as a miracle, you are just being foolled and being blind to it"]

embedded_sentences = tokenizer(
    text=sentences,
    add_special_tokens=True,
    max_length=max_length,
    truncation=True,
    padding='max_length', 
    return_tensors='tf',
    return_token_type_ids = True,
    return_attention_mask = True,
    verbose = True)

embedded_sentences

{'input_ids': <tf.Tensor: shape=(4, 120), dtype=int32, numpy=
array([[ 101, 2017, 2024, 2107, 1037, 2919, 3124,  102,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0],
       [ 101, 1045, 2514, 2200, 2204, 2007, 2017,  102,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,

In [10]:
model.call(embedded_sentences)

{'Y1': <tf.Tensor: shape=(4, 4), dtype=float32, numpy=
 array([[1.       , 1.       , 0.9999267, 0.9999999],
        [1.       , 1.       , 1.       , 0.9999981],
        [1.       , 1.       , 1.       , 1.       ],
        [1.       , 1.       , 0.9999933, 1.       ]], dtype=float32)>,
 'Y2': <tf.Tensor: shape=(4, 16), dtype=float32, numpy=
 array([[0.9999968 , 0.99998856, 0.9987206 , 1.        , 0.9999994 ,
         0.989606  , 0.9999976 , 0.20016673, 0.9999951 , 0.9999331 ,
         1.        , 0.9999981 , 1.        , 0.99981457, 0.99995685,
         0.68332064],
        [1.        , 0.9999974 , 1.        , 0.99999964, 0.99797076,
         0.99997735, 0.99994516, 0.04472988, 0.9999957 , 0.99991024,
         0.9999733 , 1.        , 0.9999999 , 0.9984396 , 0.99999964,
         0.0304011 ],
        [0.9999999 , 0.9999958 , 0.9999999 , 0.99999964, 0.9999558 ,
         0.99938977, 0.99999785, 0.25262198, 0.9999995 , 0.99977225,
         0.999987  , 0.9999436 , 0.9999993 , 0.9942814 , 1.