In [1]:
# install huggingface Transformers [https://huggingface.co/transformers/installation.html]

# Many transformer based models in a single library: https://github.com/huggingface/transformers#model-architectures
! pip install transformers

# This week: we will use HuggingFace BERT implementations.
# Next sessions: Build an encoder-decoder seq-seq Transfomer from scratch using TF/Keras.

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m15.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m41.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m41.5 MB/s[0m eta [36m0:00:0

In [2]:
# Reference: https://medium.com/tensorflow/using-tensorflow-2-for-state-of-the-art-natural-language-processing-102445cda54a
# Ref: https://huggingface.co/transformers/notebooks.html

In [3]:
%tensorflow_version 2.x
import tensorflow as tf
print(tf.__version__)

Colab only includes TensorFlow 2.x; %tensorflow_version has no effect.
2.12.0


## Tokenization

In [4]:
# Tokenization: map words to ids
# Refer: https://colab.research.google.com/github/huggingface/transformers/blob/master/notebooks/01-training-tokenizers.ipynb#scrollTo=LgktNYt7ADPS

# simple example
s = "very long corpus..."
words = s.split(" ")  # Split over space
vocabulary = dict(enumerate(set(words)))  # Map storing the word to it's corresponding id

print(vocabulary)

# Problems: cat(1123) vs cats(1346)

{0: 'long', 1: 'very', 2: 'corpus...'}


### Sub-tokenization

- Why? : fast vs faster, cat vs cats
- example: cats --**bold text**> [cat, ##s]
- Image: https://nlp.fast.ai/images/multifit_vocabularies.png

<img src="https://nlp.fast.ai/images/multifit_vocabularies.png" alt="Smiley face" height="75%" width="75%">


### Tokenization in huggingface
**bold text**

In [5]:
from transformers import BertTokenizer
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [6]:
# Refer BERT architecture from the previous videos in the course.

#https://huggingface.co/transformers/main_classes/tokenizer.html
print(bert_tokenizer.cls_token)

[CLS]


In [7]:
enc = bert_tokenizer.encode("Hi, I am James bond !")
print(enc)

print(bert_tokenizer.decode(enc))

[101, 8790, 117, 146, 1821, 1600, 7069, 106, 102]
[CLS] Hi, I am James bond! [SEP]


In [8]:
print(bert_tokenizer.decode([117]))
print(bert_tokenizer.decode([106]))

,
!


In [9]:
enc = bert_tokenizer.encode("I see many cats and dogs")
print(enc)

print(bert_tokenizer.decode(enc))

[101, 146, 1267, 1242, 11771, 1105, 6363, 102]
[CLS] I see many cats and dogs [SEP]


## BERT Models
- DistillBERT
- RoBERTa
- https://miro.medium.com/max/2000/1*IFVX74cEe8U5D1GveL1uZA.png
<img src="https://miro.medium.com/max/2000/1*IFVX74cEe8U5D1GveL1uZA.png " alt="Smiley face" height="75%" width="75%">

- https://miro.medium.com/max/1400/1*bSUO_Qib4te1xQmBlQjWaw.png
<img src="https://miro.medium.com/max/1400/1*bSUO_Qib4te1xQmBlQjWaw.png " alt="Smiley face" height="75%" width="75%">

- General Language Understanding Evaluation (GLUE)  : https://gluebenchmark.com/


In [10]:
import tensorflow as tf

# Refer: https://huggingface.co/transformers/model_doc/distilbert.html#

from transformers import DistilBertTokenizer, TFDistilBertModel

distil_bert = 'distilbert-base-uncased' # Name of the pretrained models

#DistilBERT
tokenizer = DistilBertTokenizer.from_pretrained(distil_bert)
model = TFDistilBertModel.from_pretrained(distil_bert)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertModel: ['vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight']
- This IS expected if you are initializing TFDistilBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFDistilBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


### Extract features using BERT

In [11]:
# obtain the 768-dim vector correpsoding to [CLS] which is a sentence vector

e = tokenizer.encode("Hello, my dog is cute")
print(e)

input = tf.constant(e)[None, :]  # Batch size 1
print(input)
print(type(input)) # shape: [1,8]

output = model(input)

print(type(output))
print(len(output))
print(output) #shape[1,8,768]

[101, 7592, 1010, 2026, 3899, 2003, 10140, 102]
tf.Tensor([[  101  7592  1010  2026  3899  2003 10140   102]], shape=(1, 8), dtype=int32)
<class 'tensorflow.python.framework.ops.EagerTensor'>
<class 'transformers.modeling_tf_outputs.TFBaseModelOutput'>
1
TFBaseModelOutput(last_hidden_state=<tf.Tensor: shape=(1, 8, 768), dtype=float32, numpy=
array([[[-1.8296382e-01, -7.4054182e-02,  5.0267622e-02, ...,
         -1.1260702e-01,  4.4493088e-01,  4.0941316e-01],
        [ 7.0649944e-04,  1.4825346e-01,  3.4328276e-01, ...,
         -8.6039737e-02,  6.9474745e-01,  4.3352664e-02],
        [-5.0720561e-01,  5.3085524e-01,  3.7162632e-01, ...,
         -5.6287485e-01,  1.3755700e-01,  2.8475225e-01],
        ...,
        [-4.2251298e-01,  5.7314806e-02,  2.4338327e-01, ...,
         -1.5222709e-01,  2.4462473e-01,  6.4154845e-01],
        [-4.9384439e-01, -1.8895467e-01,  1.2640776e-01, ...,
          6.3240372e-02,  3.6912853e-01, -5.8251858e-02],
        [ 8.3268678e-01,  2.4948205e-01, -4

In [15]:
#[CLS] corresponding vector
print((output[0])[0,0,:])  # shape: 768 dim vector

tf.Tensor(
[-1.82963818e-01 -7.40541816e-02  5.02676219e-02 -3.49530548e-01
 -7.28534237e-02 -2.63872594e-01  2.39293426e-01  4.79841977e-01
 -2.14802399e-01 -1.89516395e-01  8.99827331e-02 -1.29188925e-01
 -1.11275904e-01  3.16634476e-01 -8.25904980e-02  9.26223472e-02
 -2.09082663e-02  4.74876285e-01  1.28833607e-01  3.18705849e-03
 -1.53505579e-01 -3.57002020e-01  9.89440829e-04 -3.92741710e-03
  1.38443653e-02 -5.49407899e-02  8.45261216e-02  1.36564314e-01
  2.18252182e-01 -1.96798742e-01  2.47996002e-02  1.75569162e-01
 -3.97216827e-02 -1.10777177e-01  5.48523962e-02  6.07529357e-02
  1.71999857e-02 -1.07415296e-01 -8.76945630e-02  2.12041959e-01
 -4.05892432e-02 -3.17956284e-02  1.37657210e-01 -1.39004573e-01
 -4.68880683e-03 -3.97633344e-01 -2.60034609e+00 -1.08741626e-01
  4.86708581e-02 -3.61387730e-01  3.71814281e-01 -7.61095956e-02
  3.23911943e-02  2.31666535e-01  2.63016015e-01  3.18299532e-01
 -3.87970865e-01  2.98110932e-01 -4.93030883e-02 -3.59302163e-02
  1.58540606e-

In [16]:
# How about hidden layer outputs

#https://huggingface.co/transformers/model_doc/distilbert.html#distilbertconfig
from transformers import  DistilBertConfig

config = DistilBertConfig.from_pretrained(distil_bert, output_hidden_states=True)


e = tokenizer.encode("Hello, my dog is cute")
input = tf.constant(e)[None, :]  # Batch size 1
model = TFDistilBertModel.from_pretrained(distil_bert, config=config)
print(model.config) # Every model has a config file

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertModel: ['vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight']
- This IS expected if you are initializing TFDistilBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFDistilBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.31.0",
  "vocab_size": 30522
}



Since we have set the show hidden states = true therefore we will get the hidden states in the first index of the output.
We can access the ith hidden layer through output[0][i] and if we want to access the first word of the first sentence we can use the output[0][i][0,0,:].

- 0th sentence
- 0th word
- : all values

In [17]:
output = model(input)
print(len(output))

2


In [18]:
print(output[0])

tf.Tensor(
[[[-1.8296382e-01 -7.4054182e-02  5.0267622e-02 ... -1.1260702e-01
    4.4493088e-01  4.0941316e-01]
  [ 7.0649944e-04  1.4825346e-01  3.4328276e-01 ... -8.6039737e-02
    6.9474745e-01  4.3352664e-02]
  [-5.0720561e-01  5.3085524e-01  3.7162632e-01 ... -5.6287485e-01
    1.3755700e-01  2.8475225e-01]
  ...
  [-4.2251298e-01  5.7314806e-02  2.4338327e-01 ... -1.5222709e-01
    2.4462473e-01  6.4154845e-01]
  [-4.9384439e-01 -1.8895467e-01  1.2640776e-01 ...  6.3240372e-02
    3.6912853e-01 -5.8251858e-02]
  [ 8.3268678e-01  2.4948205e-01 -4.5439535e-01 ...  1.1997542e-01
   -3.9257339e-01 -2.7785397e-01]]], shape=(1, 8, 768), dtype=float32)


In [19]:
output[0].shape

TensorShape([1, 8, 768])

In [20]:
output[1][0].shape

TensorShape([1, 8, 768])

In [21]:
print(type(output[1]))
print(len(output[1])) # 7 Why?
#len(output[1]) will return 7. This is because the output of the model is a sequence of 8 hidden states, but the first hidden state is reserved for the CLS token, which is a special token that is used to represent the entire input sequence. Therefore, the remaining 7 hidden states correspond to the 7 tokens in the input sequence.
print(output[1][6]) # Shape:(1,8,768)

<class 'tuple'>
7
tf.Tensor(
[[[-1.8296382e-01 -7.4054182e-02  5.0267622e-02 ... -1.1260702e-01
    4.4493088e-01  4.0941316e-01]
  [ 7.0649944e-04  1.4825346e-01  3.4328276e-01 ... -8.6039737e-02
    6.9474745e-01  4.3352664e-02]
  [-5.0720561e-01  5.3085524e-01  3.7162632e-01 ... -5.6287485e-01
    1.3755700e-01  2.8475225e-01]
  ...
  [-4.2251298e-01  5.7314806e-02  2.4338327e-01 ... -1.5222709e-01
    2.4462473e-01  6.4154845e-01]
  [-4.9384439e-01 -1.8895467e-01  1.2640776e-01 ...  6.3240372e-02
    3.6912853e-01 -5.8251858e-02]
  [ 8.3268678e-01  2.4948205e-01 -4.5439535e-01 ...  1.1997542e-01
   -3.9257339e-01 -2.7785397e-01]]], shape=(1, 8, 768), dtype=float32)


 **Same steps as above, for any Transformer /BERT like model**

### Fine-tuning for various tasks

- Refer: https://arxiv.org/pdf/1810.04805.pdf

-Next video