In [1]:
# install huggingface Transformers [https://huggingface.co/transformers/installation.html]

# Many transformer based models in a single library: https://github.com/huggingface/transformers#model-architectures
! pip install transformers




In [2]:
# Colab only ships TensorFlow 2.x, so the former `%tensorflow_version 2.x` magic had no
# effect there, and it is a Colab-specific magic. A plain import is all that is needed.
import tensorflow as tf

print(tf.__version__)

Colab only includes TensorFlow 2.x; %tensorflow_version has no effect.
2.14.0


# Tokenization

In [8]:

# Toy word-level tokenizer
s = "machine learning model..."
words = s.split(" ")  # one token per space-separated chunk

# Lookup table from integer id to word; ids follow the sorted order of the unique words
vocabulary = {idx: word for idx, word in enumerate(sorted(set(words)))}

print(vocabulary)

# Problem with whole-word ids: related forms such as cat(1123) and cats(1346) get unrelated ids

{0: 'learning', 1: 'machine', 2: 'model...'}


# Tokenization in Hugging Face
**Using a pretrained tokenizer (`bert-base-cased`) from the `transformers` library**

In [9]:
from transformers import BertTokenizer

# Name of the pretrained checkpoint whose tokenizer files are loaded
bert_checkpoint = "bert-base-cased"

# Loads the tokenizer files for that checkpoint (see the progress bars in the output)
bert_tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)

tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [10]:
# Special token string that the tokenizer places at the start of an encoded sequence
cls_token = bert_tokenizer.cls_token
print(cls_token)

[CLS]


In [11]:
# Text -> ids. encode() adds the special tokens itself: the id list starts with 101
# and ends with 102, which decode back to [CLS] and [SEP].
sentence = "Hi, I am James bond !"
enc = bert_tokenizer.encode(sentence)
print(enc)

# ids -> text. The round trip is not character-exact: the space before "!" is not restored.
decoded = bert_tokenizer.decode(enc)
print(decoded)

[101, 8790, 117, 146, 1821, 1600, 7069, 106, 102]
[CLS] Hi, I am James bond! [SEP]


In [12]:
# Decode individual ids to see which token each one stands for
# (117 and 106 both occur in the encoding of the previous sentence).
for token_id in (117, 106):
    print(bert_tokenizer.decode([token_id]))

,
!


In [13]:
# Encode a sentence containing plural nouns and inspect the resulting ids
sentence = "I see many cats and dogs"
enc = bert_tokenizer.encode(sentence)
print(enc)

# Decode the ids back to text to check the round trip
decoded = bert_tokenizer.decode(enc)
print(decoded)

[101, 146, 1267, 1242, 11771, 1105, 6363, 102]
[CLS] I see many cats and dogs [SEP]


# **BERT Models**


*   DistilBERT



In [14]:
import tensorflow as tf
from transformers import DistilBertTokenizer, TFDistilBertModel

# Name of the pretrained checkpoint; shared by the tokenizer and the model below
distil_bert = 'distilbert-base-uncased'

# DistilBERT: tokenizer first, then the TensorFlow model, both from the same checkpoint
tokenizer = DistilBertTokenizer.from_pretrained(distil_bert)
model = TFDistilBertModel.from_pretrained(distil_bert)

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertModel: ['vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_transform.weight']
- This IS expected if you are initializing TFDistilBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFDistilBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


# Extract features using BERT

In [15]:
# Obtain the 768-dim vector corresponding to [CLS], which serves as a sentence vector

token_ids = tokenizer.encode("Hello, my dog is cute")
print(token_ids)

# Named `input_ids` (not `input`) so the Python builtin input() is not shadowed.
input_ids = tf.constant(token_ids)[None, :]  # add a leading batch axis -> shape [1, 8]
print(input_ids)
print(type(input_ids))

output = model(input_ids)

print(type(output))
print(len(output))
print(output)  # last_hidden_state has shape [1, 8, 768]


[101, 7592, 1010, 2026, 3899, 2003, 10140, 102]
tf.Tensor([[  101  7592  1010  2026  3899  2003 10140   102]], shape=(1, 8), dtype=int32)
<class 'tensorflow.python.framework.ops.EagerTensor'>
<class 'transformers.modeling_tf_outputs.TFBaseModelOutput'>
1
TFBaseModelOutput(last_hidden_state=<tf.Tensor: shape=(1, 8, 768), dtype=float32, numpy=
array([[[-1.8296391e-01, -7.4054100e-02,  5.0267510e-02, ...,
         -1.1260703e-01,  4.4493085e-01,  4.0941304e-01],
        [ 7.0612691e-04,  1.4825360e-01,  3.4328312e-01, ...,
         -8.6039729e-02,  6.9474769e-01,  4.3352760e-02],
        [-5.0720561e-01,  5.3085506e-01,  3.7162617e-01, ...,
         -5.6287444e-01,  1.3755682e-01,  2.8475249e-01],
        ...,
        [-4.2251363e-01,  5.7314865e-02,  2.4338329e-01, ...,
         -1.5222672e-01,  2.4462420e-01,  6.4154822e-01],
        [-4.9384448e-01, -1.8895446e-01,  1.2640814e-01, ...,
          6.3240394e-02,  3.6912829e-01, -5.8252428e-02],
        [ 8.3268678e-01,  2.4948236e-01, -4

In [16]:
# Vector for the [CLS] token: batch item 0, sequence position 0, all 768 features.
# `last_hidden_state` is the first field of the output, i.e. the same tensor as output[0].
cls_vector = output.last_hidden_state[0, 0, :]
print(cls_vector)

tf.Tensor(
[-1.82963908e-01 -7.40540996e-02  5.02675101e-02 -3.49530518e-01
 -7.28533790e-02 -2.63872474e-01  2.39293426e-01  4.79842275e-01
 -2.14802533e-01 -1.89516395e-01  8.99827033e-02 -1.29188865e-01
 -1.11275911e-01  3.16634417e-01 -8.25903565e-02  9.26225707e-02
 -2.09083334e-02  4.74876046e-01  1.28833815e-01  3.18706594e-03
 -1.53505504e-01 -3.57001662e-01  9.89256427e-04 -3.92749812e-03
  1.38444398e-02 -5.49409389e-02  8.45262557e-02  1.36564448e-01
  2.18252271e-01 -1.96798801e-01  2.47995201e-02  1.75569549e-01
 -3.97215784e-02 -1.10777073e-01  5.48525415e-02  6.07528873e-02
  1.72001235e-02 -1.07415199e-01 -8.76945555e-02  2.12042108e-01
 -4.05891836e-02 -3.17960083e-02  1.37657285e-01 -1.39004812e-01
 -4.68854606e-03 -3.97633433e-01 -2.60034633e+00 -1.08741574e-01
  4.86708619e-02 -3.61387551e-01  3.71814370e-01 -7.61097521e-02
  3.23912278e-02  2.31666282e-01  2.63016105e-01  3.18299860e-01
 -3.87970775e-01  2.98111171e-01 -4.93027978e-02 -3.59302983e-02
  1.58540636e-

In [17]:
# How about the hidden layer outputs?

# https://huggingface.co/transformers/model_doc/distilbert.html#distilbertconfig
from transformers import DistilBertConfig

# Same checkpoint, but with output_hidden_states switched on in its config
config = DistilBertConfig.from_pretrained(distil_bert, output_hidden_states=True)

# Reload the model with that config; this rebinds `model` for all following cells
model = TFDistilBertModel.from_pretrained(distil_bert, config=config)
print(model.config)  # Every model has a config file

# Encode the example sentence again and add the batch axis (batch size 1).
# NOTE(review): `input` shadows the Python builtin; it is kept because the next cell reads it.
e = tokenizer.encode("Hello, my dog is cute")
input = tf.constant(e)[None, :]



Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertModel: ['vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_transform.weight']
- This IS expected if you are initializing TFDistilBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFDistilBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.35.2",
  "vocab_size": 30522
}



In [18]:
# Forward pass through the model that was reloaded with output_hidden_states=True
output = model(input)

# Number of populated fields in the returned output object
num_fields = len(output)
print(num_fields)

2


In [19]:
# First field of the output, accessed by name instead of by position (same tensor as output[0])
print(output.last_hidden_state)

tf.Tensor(
[[[-1.8296391e-01 -7.4054100e-02  5.0267510e-02 ... -1.1260703e-01
    4.4493085e-01  4.0941304e-01]
  [ 7.0612691e-04  1.4825360e-01  3.4328312e-01 ... -8.6039729e-02
    6.9474769e-01  4.3352760e-02]
  [-5.0720561e-01  5.3085506e-01  3.7162617e-01 ... -5.6287444e-01
    1.3755682e-01  2.8475249e-01]
  ...
  [-4.2251363e-01  5.7314865e-02  2.4338329e-01 ... -1.5222672e-01
    2.4462420e-01  6.4154822e-01]
  [-4.9384448e-01 -1.8895446e-01  1.2640814e-01 ...  6.3240394e-02
    3.6912829e-01 -5.8252428e-02]
  [ 8.3268678e-01  2.4948236e-01 -4.5439535e-01 ...  1.1997570e-01
   -3.9257339e-01 -2.7785373e-01]]], shape=(1, 8, 768), dtype=float32)


In [20]:
# Shape of the final hidden state, read via the named field
output.last_hidden_state.shape

TensorShape([1, 8, 768])

In [21]:
# Shape of the first entry in the tuple of per-layer hidden states (second output field)
output.hidden_states[0].shape

TensorShape([1, 8, 768])

In [22]:
# Second output field: the tuple of hidden states
hidden_states = output.hidden_states
print(type(hidden_states))
print(len(hidden_states))  # 7 entries (the printed config has n_layers = 6)

# Last entry of the tuple, shape (1, 8, 768); the values printed below match last_hidden_state
print(hidden_states[-1])

<class 'tuple'>
7
tf.Tensor(
[[[-1.8296391e-01 -7.4054100e-02  5.0267510e-02 ... -1.1260703e-01
    4.4493085e-01  4.0941304e-01]
  [ 7.0612691e-04  1.4825360e-01  3.4328312e-01 ... -8.6039729e-02
    6.9474769e-01  4.3352760e-02]
  [-5.0720561e-01  5.3085506e-01  3.7162617e-01 ... -5.6287444e-01
    1.3755682e-01  2.8475249e-01]
  ...
  [-4.2251363e-01  5.7314865e-02  2.4338329e-01 ... -1.5222672e-01
    2.4462420e-01  6.4154822e-01]
  [-4.9384448e-01 -1.8895446e-01  1.2640814e-01 ...  6.3240394e-02
    3.6912829e-01 -5.8252428e-02]
  [ 8.3268678e-01  2.4948236e-01 -4.5439535e-01 ...  1.1997570e-01
   -3.9257339e-01 -2.7785373e-01]]], shape=(1, 8, 768), dtype=float32)


**The same steps as above apply to any Transformer / BERT-like model.**