<a href="https://colab.research.google.com/github/WenquanZou/actor_assistant/blob/BERT_example/demo_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Library installation

In [2]:
!pip install transformers
import torch
from transformers import BertTokenizer, BertModel

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/ee/fc/bd726a15ab2c66dc09306689d04da07a3770dad724f0883f0a4bfb745087/transformers-2.4.1-py3-none-any.whl (475kB)
[K     |████████████████████████████████| 481kB 2.9MB/s 
[?25hCollecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/a6/b4/7a41d630547a4afd58143597d5a49e07bfd4c42914d8335b2a5657efc14b/sacremoses-0.0.38.tar.gz (860kB)
[K     |████████████████████████████████| 870kB 45.6MB/s 
[?25hCollecting tokenizers==0.0.11
[?25l  Downloading https://files.pythonhosted.org/packages/5e/36/7af38d572c935f8e0462ec7b4f7a46d73a2b3b1a938f50a5e8132d5b2dc5/tokenizers-0.0.11-cp36-cp36m-manylinux1_x86_64.whl (3.1MB)
[K     |████████████████████████████████| 3.1MB 37.7MB/s 
Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/74/f4/2d5214cbf13d06e7cb2c20d84115ca25b53ea76fa1f0ade0e3c9749de214/sentencepiece-0.1.85-cp36-cp36m-manylinux1_x86_64.whl (1.0MB)
[K  

#2 Preprocessing


## 2.1 Data collection

In [0]:
# TODO: Download data here

## 2.2 Parse


In [0]:
# An example of parsing raw input into BERT acceptable input
# TODO: Parse data
# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

# An example of BERT input
text_dataset = ["After stealing money from the bank vault, the bank robber was seen " \
       "fishing on the Mississippi river bank."]

inputs = [tokenizer.encode_plus(text_input, add_special_tokens = True, pad_to_max_length=True) for text_input in text_dataset]

input_ids = [d['input_ids'] for d in inputs]
input_segments = [d['token_type_ids'] for d in inputs]
# input_attention_masks = [d['attention_mask'] for d in inputs]


# 3 Implementation of models

In [8]:
# Convert inputs to PyTorch tensors
tokens_tensor = torch.tensor(input_ids)
segments_tensor = torch.tensor(input_segments)
# attention_masks_tensor = torch.tensor(input_attention_masks)

# # Load pre-trained model (weights)
model = BertModel.from_pretrained('bert-base-cased')

# # Put the model in "evaluation" mode, meaning feed-forward operation.
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(28996, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [0]:
# Get all of the model's parameters as a list of tuples.
params = list(model.named_parameters())

print('The BERT model has {:} different named parameters.\n'.format(len(params)))

print('==== Embedding Layer ====\n')

for p in params[0:5]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== First Transformer ====\n')

for p in params[5:21]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== Output Layer ====\n')

for p in params[-4:]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

The BERT model has 199 different named parameters.

==== Embedding Layer ====

embeddings.word_embeddings.weight                       (119547, 768)
embeddings.position_embeddings.weight                     (512, 768)
embeddings.token_type_embeddings.weight                     (2, 768)
embeddings.LayerNorm.weight                                   (768,)
embeddings.LayerNorm.bias                                     (768,)

==== First Transformer ====

encoder.layer.0.attention.self.query.weight               (768, 768)
encoder.layer.0.attention.self.query.bias                     (768,)
encoder.layer.0.attention.self.key.weight                 (768, 768)
encoder.layer.0.attention.self.key.bias                       (768,)
encoder.layer.0.attention.self.value.weight               (768, 768)
encoder.layer.0.attention.self.value.bias                     (768,)
encoder.layer.0.attention.output.dense.weight             (768, 768)
encoder.layer.0.attention.output.dense.bias                   

# 4 Model training

# 5 Model output
BERT model output a feature can be learn for later fine-tuning, which has (512, 768) dimension for just one sentence embedding.

In [11]:
# # Predict hidden states features for each layer
with torch.no_grad():
    last_hidden_states,_ = model(tokens_tensor, segments_tensor)

print(f"The shape of feature {last_hidden_states.shape} ")


print(last_hidden_states)

The shape of feature torch.Size([1, 512, 768]) 
tensor([[[-0.3431, -0.3021, -0.4826,  ..., -0.5792,  0.5865,  0.4388],
         [ 0.6419, -0.8556,  0.2585,  ..., -0.2039,  0.2330,  0.0675],
         [ 0.0347, -0.1806, -0.3851,  ..., -0.2327, -0.0462,  0.2904],
         ...,
         [-0.4802, -0.1300,  0.0429,  ..., -1.0763,  0.3405,  0.1989],
         [-0.4533, -0.3078, -0.0975,  ..., -1.0217,  0.2284, -0.0421],
         [-0.7667, -0.2916, -0.1075,  ..., -0.7291,  0.1455, -0.0245]]])
