#Using BERT for Word Embeddings

In [1]:
from transformers import BertTokenizer, BertModel
import torch

# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Tokenize input sentence and generate embeddings
sentence = "I went to the bank to deposit money"
inputs = tokenizer(sentence, return_tensors="pt")
outputs = model(**inputs)

# The last hidden state contains the contextual embeddings for each word
embeddings = outputs.last_hidden_state
print(embeddings.shape)  # [batch_size, sequence_length, hidden_size]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

torch.Size([1, 10, 768])


#Extract Specific Word Embeddings

In [2]:
word_index = sentence.split().index("bank")  # Find the index of the word "bank"
word_embedding = embeddings[0, word_index]   # Get the embedding for "bank"
print(f"Embedding for 'bank': {word_embedding}")

Embedding for 'bank': tensor([ 3.9621e-01, -6.2730e-01, -3.0910e-01,  8.2134e-01,  4.7180e-01,
         6.9128e-02,  2.8338e-01,  6.2293e-01,  2.2931e-01, -7.0774e-01,
         3.9415e-01,  4.4497e-02, -3.4668e-01,  6.8605e-01, -1.6213e-01,
        -1.9277e-01,  9.2448e-01, -4.7359e-01,  1.5116e-01, -2.0374e-01,
        -2.1745e-01, -6.6374e-03,  3.3649e-01,  5.1218e-01,  5.6537e-01,
         3.2563e-01, -3.5499e-01,  6.7122e-01, -6.0395e-01, -5.1964e-01,
         2.8966e-01, -2.2991e-01, -1.2626e+00, -3.6478e-01, -1.7716e-01,
         1.9846e-01,  3.3150e-02, -4.1250e-01, -8.7178e-01,  4.2324e-01,
        -6.5352e-01, -2.6243e-01,  2.1568e-01, -5.4737e-01,  3.8929e-01,
        -3.0192e-01,  9.4213e-01, -5.2204e-01, -5.1965e-01, -9.4459e-01,
        -5.0051e-01, -2.1979e-02, -1.3667e-01,  4.8488e-02, -1.9403e-01,
         1.1578e+00,  2.7088e-01, -2.8382e-01, -7.0100e-01, -8.3562e-02,
        -7.2506e-02, -6.9357e-01,  9.5272e-01,  1.3999e-01,  1.9123e-01,
         5.9152e-02,  9.7344e

#CLS Token for Sentence Representation

In [3]:
# Extract the CLS token embedding (first token of the sequence)
cls_embedding = embeddings[0, 0]  # The first token is [CLS]
print(f"CLS token embedding (for the whole sentence): {cls_embedding}")

CLS token embedding (for the whole sentence): tensor([ 2.8457e-01,  3.7847e-01, -1.0055e-01,  2.4363e-02,  3.9028e-03,
         1.8755e-01,  1.5924e-01,  3.8033e-01,  2.3080e-01, -2.4598e-01,
         7.8690e-02,  1.0405e-01,  1.2725e-01,  1.4036e-01,  8.4448e-02,
        -2.6664e-01,  2.5248e-02,  3.0605e-01,  1.2317e-01, -2.7540e-01,
        -9.5291e-02, -9.7537e-02,  2.5627e-03,  2.5437e-01,  2.2590e-02,
         5.5341e-02, -3.7632e-01,  1.4905e-02, -5.9932e-02,  1.1040e-01,
         2.7499e-01,  2.4288e-02, -1.8333e-01,  1.1836e-01, -4.1604e-03,
        -3.4382e-01,  3.8609e-01, -3.0965e-01, -1.9044e-01, -1.2243e-01,
        -2.7589e-01,  2.7016e-01,  9.6298e-02,  1.8258e-01,  2.2397e-01,
        -2.4444e-01, -2.2671e+00, -2.3820e-01, -1.2636e-01, -4.3239e-01,
         7.1430e-02, -1.6420e-01,  8.9329e-02,  4.4714e-01,  2.1396e-01,
         2.6775e-01, -3.1124e-01,  4.8643e-01,  1.0447e-01,  1.7978e-01,
         5.8454e-01, -5.9180e-03,  6.4388e-02,  3.3939e-01,  1.1720e-01,
     

#Layer-wise Embedding Extraction

In [7]:
from transformers import BertTokenizer, BertModel
import torch

# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Tokenize input sentence and generate embeddings
sentence = "I went to the bank to deposit money"
inputs = tokenizer(sentence, return_tensors="pt")

# **Change**: Pass `output_hidden_states=True` to the model's forward pass
outputs = model(**inputs, output_hidden_states=True)

# The last hidden state contains the contextual embeddings for each word
embeddings = outputs.last_hidden_state
print(embeddings.shape)  # [batch_size, sequence_length, hidden_size]

# Get embeddings from all layers (12 layers in base BERT model)
all_layer_embeddings = outputs.hidden_states  # `hidden_states` gives embeddings for each layer
print(f"Number of layers: {len(all_layer_embeddings)}")

# You can extract the embedding from the last, middle, or first layer
last_layer_embedding = all_layer_embeddings[-1]  # Embeddings from the last layer
middle_layer_embedding = all_layer_embeddings[len(all_layer_embeddings)//2]  # Middle layer
first_layer_embedding = all_layer_embeddings[0]  # First layer

print(f"Embedding from last layer: {last_layer_embedding}")
print(f"Embedding from middle layer: {middle_layer_embedding}")
print(f"Embedding from first layer: {first_layer_embedding}")



torch.Size([1, 10, 768])
Number of layers: 13
Embedding from last layer: tensor([[[ 0.2846,  0.3785, -0.1006,  ...,  0.1152,  0.0543,  0.4230],
         [ 0.6447, -0.2538, -0.2583,  ..., -0.3467,  0.4817, -0.0888],
         [ 0.4446, -0.6874,  0.0461,  ..., -0.0597, -0.5083,  0.4963],
         ...,
         [ 0.6191, -0.0664,  1.2264,  ...,  0.2036, -0.8543,  0.0119],
         [ 0.4405, -0.3249, -0.2979,  ...,  0.8151,  0.0759, -0.0836],
         [ 0.7993,  0.2668, -0.2364,  ...,  0.0467, -0.5778, -0.3592]]],
       grad_fn=<NativeLayerNormBackward0>)
Embedding from middle layer: tensor([[[-8.9841e-02, -1.9727e-01, -2.0159e-01,  ..., -6.9719e-02,
           4.7862e-02,  4.4728e-01],
         [ 1.2190e+00,  1.8484e-02, -4.4327e-01,  ...,  1.0007e-01,
          -8.6046e-02, -2.1791e-02],
         [ 5.4395e-02, -4.7916e-01, -4.8046e-02,  ...,  4.1783e-01,
          -1.2044e+00,  2.3808e-01],
         ...,
         [ 1.4475e+00,  4.4810e-01,  1.1346e+00,  ...,  5.4666e-01,
           2.687

#Cosine Similarity Between Word Embeddings

In [5]:
from torch.nn.functional import cosine_similarity

# Extract embeddings for "bank" and "money" and compute cosine similarity
bank_embedding = embeddings[0, sentence.split().index("bank")]
money_embedding = embeddings[0, sentence.split().index("money")]

similarity = cosine_similarity(bank_embedding.unsqueeze(0), money_embedding.unsqueeze(0))
print(f"Cosine similarity between 'bank' and 'money': {similarity.item()}")

Cosine similarity between 'bank' and 'money': 0.4556240439414978
