In [7]:
from transformers import BertModel, BertTokenizer
import torch

In [10]:
# Load the pre-trained 'bert-base-uncased' checkpoint: first the tokenizer
# (used below to map tokens to vocabulary ids), then the encoder itself.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [21]:
sentence = "I Love Mysore and i leave in Krishnarajanagar"

# Step 1: tokenize the sentence.
# The text is lower-cased and split on whitespace, giving one token per word.
# NOTE(review): this does not go through the tokenizer's own sub-word splitting,
# so a word missing from the vocabulary stays a single token. The id list printed
# in step 5 shows 100 for 'krishnarajanagar', which looks like the [UNK] id -- confirm
# whether tokenizer.tokenize(sentence) should be used here instead.
tokens = sentence.lower().split()
tokens

['i', 'love', 'mysore', 'and', 'i', 'leave', 'in', 'krishnarajanagar']

In [23]:
# Step 2: add the special tokens -- [CLS] at the start and [SEP] at the end.
#
# `tokens` is rebuilt from itself, so without a guard every re-run of this cell
# would wrap the list in another [CLS] ... [SEP] pair. The check keeps the cell
# idempotent: the special tokens are added only if they are not already there.
if tokens[:1] != ['[CLS]']:
    tokens = ['[CLS]'] + tokens + ['[SEP]']
print(" Tokens are \n {} ".format(tokens))

 Tokens are 
 ['[CLS]', 'i', 'love', 'mysore', 'and', 'i', 'leave', 'in', 'krishnarajanagar', '[SEP]'] 


In [26]:
# Step 3: pad the input to a fixed length and build the attention mask.

T = 15  # target sequence length after padding
# Append one '[PAD]' per missing position. If `tokens` is already T or longer,
# nothing is appended (and nothing is truncated).
padded_tokens = tokens + ['[PAD]'] * (T - len(tokens))
print("Padded tokens are \n {} ".format(padded_tokens))

# 1 marks a real token, 0 marks a padding position.
attn_mask = [int(token != '[PAD]') for token in padded_tokens]
print("Attention Mask are \n {} ".format(attn_mask))

Padded tokens are 
 ['[CLS]', 'i', 'love', 'mysore', 'and', 'i', 'leave', 'in', 'krishnarajanagar', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]'] 
Attention Mask are 
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0] 


In [28]:
# Step 4: build the list of segment ids, one per (padded) token.
# Every position gets segment id 0.
seg_ids = [0] * len(padded_tokens)

print("Segment Tokens are \n {}".format(seg_ids))

Segment Tokens are 
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [29]:
# Step 5: obtain the indices of the tokens in BERT's vocabulary.

sent_ids = tokenizer.convert_tokens_to_ids(padded_tokens)
print("senetence idexes \n {} ".format(sent_ids))

# Convert everything to tensors of shape (1, sequence length); the leading 1 is the batch.
#
# `attn_mask` and `seg_ids` are rebound here from Python lists to tensors. With
# torch.tensor(...).unsqueeze(0) a second run of this cell would receive tensors that
# already carry the batch dimension and add yet another one. as_tensor + reshape(1, -1)
# accepts both a list and an already-batched tensor and always yields (1, sequence length),
# so the cell can be re-run safely.
token_ids = torch.as_tensor(sent_ids).reshape(1, -1)
attn_mask = torch.as_tensor(attn_mask).reshape(1, -1)
seg_ids = torch.as_tensor(seg_ids).reshape(1, -1)

senetence idexes 
 [101, 1045, 2293, 20761, 1998, 1045, 2681, 1999, 100, 102, 0, 0, 0, 0, 0] 


In [39]:
 model_output = model(token_ids, attention_mask = attn_mask,token_type_ids = seg_ids)

 hidden_reps, cls_head = model_output[0],model_output[1]

print(type(hidden_reps))
print(hidden_reps.shape ) #hidden states of each token in inout sequence 
print(cls_head.shape ) #hidden states of each [cls]

<class 'torch.Tensor'>
torch.Size([1, 15, 768])
torch.Size([1, 768])


In [None]:
'''
hidden_reps : # contextualized representation of each token (word) in the sentence
cls_head    : # sentence representation

hidden_reps.shape # torch.Size([1, 15, 768])

Our batch size is 1. The sequence length is the length of the padded token list: the sentence has 10 tokens
(including [CLS] and [SEP]) and is padded to T = 15, so the sequence length is 15.
The hidden size is the representation (embedding) size, and it is 768 for the BERT-base model.


'''

In [None]:
'''
We can obtain the representation of each token as:

hidden_reps[0][0] # gives the representation of the first token, which is [CLS]
hidden_reps[0][1] # gives the representation of the second token, which is 'i'
hidden_reps[0][2] # gives the representation of the third token, which is 'love'

In this way, we can obtain the contextual representation of all the tokens.

These are the contextualized word embeddings of all the words in the given sentence.

'''

In [None]:
'''
Let's take a look at cls_head. It is the second element of the model output (model_output[1])
and is derived from the representation of the [CLS] token.

cls_head.shape # [1, 768]

cls_head holds the aggregate representation of the sentence, so we can use
cls_head as the representation of the given sentence 'I Love Mysore and i leave in Krishnarajanagar'.

'''

In [None]:
# Print the shape of cls_head, the sentence-level output unpacked from model_output[1].
# (The name is `cls_head`; the previous `clas_head` was never defined and raised NameError.)

print(cls_head.shape)

In [None]:
'''
We learned how to extract embeddings from the pre-trained BERT.
But these are the embeddings obtained only from the topmost encoder layer of BERT, which is encoder 12.

 Can we also extract the embeddings from all the encoder layers of BERT?
     - Yes
'''