In [1]:
!pip install transformers==3.5.1

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
!pip install torch==1.4.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# Extracting Embeddings from BERT

In [3]:
from transformers import BertModel,BertTokenizer
import torch

In [4]:
# Load the pretrained 'bert-base-uncased' BERT model (files are downloaded on first use).
model = BertModel.from_pretrained('bert-base-uncased')

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [5]:
# Load the tokenizer that matches the 'bert-base-uncased' checkpoint loaded above.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [6]:
# Split the example sentence into tokens; the printed result shows that the
# uncased tokenizer lower-cases the text.
sentence = 'I love Paris'
tokens = tokenizer.tokenize(sentence)
print(tokens)

['i', 'love', 'paris']


In [7]:
# Wrap the sentence tokens with the special markers: [CLS] at the start and
# [SEP] at the end.
tokens = ['[CLS]', *tokens, '[SEP]']
print(tokens)

['[CLS]', 'i', 'love', 'paris', '[SEP]']


In [8]:
# Pad the sequence up to a fixed target length rather than appending a fixed
# number of [PAD] tokens. Re-running this cell is then a no-op instead of
# growing the list by two more [PAD] entries each time.
MAX_LENGTH = 7
# If the sequence is already MAX_LENGTH or longer, the multiplier is <= 0 and
# nothing is appended (no truncation is performed).
tokens = tokens + ['[PAD]'] * (MAX_LENGTH - len(tokens))
print(tokens)

['[CLS]', 'i', 'love', 'paris', '[SEP]', '[PAD]', '[PAD]']


In [9]:
# Build the attention mask: 1 for every real token, 0 for every [PAD] position.
attention_mask = [int(token != '[PAD]') for token in tokens]
print(attention_mask)

[1, 1, 1, 1, 1, 0, 0]


In [10]:
# Map every token string to its integer id in the tokenizer's vocabulary.
# In the printed result, [CLS] -> 101, [SEP] -> 102 and [PAD] -> 0.
token_ids = tokenizer.convert_tokens_to_ids(tokens)
print(token_ids)

[101, 1045, 2293, 3000, 102, 0, 0]


In [11]:
# Turn the Python lists into tensors with a leading batch dimension of size 1
# (shape: 1 x sequence length) by wrapping each list in an outer list.
token_ids = torch.tensor([token_ids])
attention_mask = torch.tensor([attention_mask])

In [12]:
# Display the batched id tensor (one row holding the ids of the padded sentence).
token_ids

tensor([[ 101, 1045, 2293, 3000,  102,    0,    0]])

In [13]:
# Display the batched attention mask (the zeros line up with the [PAD] positions).
attention_mask

tensor([[1, 1, 1, 1, 1, 0, 0]])

In [14]:
# This is inference only, so run the forward pass under torch.no_grad() to
# avoid building an autograd graph for the outputs.
with torch.no_grad():
    outputs = model(token_ids, attention_mask = attention_mask)

# Pick the outputs by position instead of tuple-unpacking the return value.
# Positional indexing works for a plain tuple (what the pinned transformers
# release returns) and for the ModelOutput objects returned by newer releases,
# where unpacking would yield the field names rather than the tensors.
#   outputs[0]: per-token representations from the final layer
#   outputs[1]: one pooled vector for the whole sequence
hidden_rep, cls_head = outputs[0], outputs[1]

In [15]:
# hidden_rep holds one vector per input token; the printed shape is
# (batch of 1, 7 tokens, 768 values per token).
print(hidden_rep.shape)

torch.Size([1, 7, 768])


In [16]:
# cls_head is a single 768-value vector for the whole sequence (batch of 1).
# NOTE(review): presumably BERT's pooled [CLS] output; confirm against the
# transformers documentation.
print(cls_head.shape)

torch.Size([1, 768])


# Extracting embeddings from multiple layers of BERT

In [17]:
# Reload the same checkpoint, this time asking the model to also return the
# hidden states of its layers (output_hidden_states = True).
model = BertModel.from_pretrained('bert-base-uncased',output_hidden_states = True)
# Same tokenizer as in the first section; reloaded so this section can be run on its own.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [18]:
# Tokenize the example sentence and wrap it with the special markers in one
# step: [CLS] at the start and [SEP] at the end.
sentence = 'I love Paris'
tokens = ['[CLS]', *tokenizer.tokenize(sentence), '[SEP]']

In [19]:
# Pad the sequence up to a fixed target length rather than appending a fixed
# number of [PAD] tokens. Re-running this cell is then a no-op instead of
# growing the list by two more [PAD] entries each time.
MAX_LENGTH = 7
# If the sequence is already MAX_LENGTH or longer, nothing is appended
# (no truncation is performed).
tokens = tokens + ['[PAD]'] * (MAX_LENGTH - len(tokens))

# Attention mask: 1 for every real token, 0 for every [PAD] position.
attention_mask = [int(token != '[PAD]') for token in tokens]

# Map every token string to its integer id in the tokenizer's vocabulary.
token_ids = tokenizer.convert_tokens_to_ids(tokens)

In [20]:
# Turn the Python lists into tensors with a leading batch dimension of size 1
# (shape: 1 x sequence length) by wrapping each list in an outer list.
token_ids = torch.tensor([token_ids])
attention_mask = torch.tensor([attention_mask])

In [21]:
# This is inference only, so run the forward pass under torch.no_grad() to
# avoid building an autograd graph for the outputs.
with torch.no_grad():
    outputs = model(token_ids, attention_mask = attention_mask)

# Pick the outputs by position instead of tuple-unpacking the return value.
# Positional indexing works for a plain tuple (what the pinned transformers
# release returns) and for the ModelOutput objects returned by newer releases,
# where unpacking would yield the field names rather than the tensors.
#   outputs[0]: per-token representations from the final layer
#   outputs[1]: one pooled vector for the whole sequence
#   outputs[2]: collection of per-layer hidden states (present because the
#               model was loaded with output_hidden_states = True)
last_hidden_state, pooler_output, hidden_states = outputs[0], outputs[1], outputs[2]

In [22]:
# Final-layer output: (batch of 1, 7 tokens, 768 values per token), as displayed below.
last_hidden_state.shape

torch.Size([1, 7, 768])

In [28]:
# Walk over every returned hidden state and print its position next to its
# shape. The output lists 13 entries (indices 0-12), each of shape (1, 7, 768).
for index, state in enumerate(hidden_states):
    print(f"{index}  {state.shape}")

0  torch.Size([1, 7, 768])
1  torch.Size([1, 7, 768])
2  torch.Size([1, 7, 768])
3  torch.Size([1, 7, 768])
4  torch.Size([1, 7, 768])
5  torch.Size([1, 7, 768])
6  torch.Size([1, 7, 768])
7  torch.Size([1, 7, 768])
8  torch.Size([1, 7, 768])
9  torch.Size([1, 7, 768])
10  torch.Size([1, 7, 768])
11  torch.Size([1, 7, 768])
12  torch.Size([1, 7, 768])


In [29]:
# pooler_output is a single 768-value vector for the whole sequence (batch of 1).
pooler_output.shape

torch.Size([1, 768])