In [2]:
import numpy as np
import pandas as pd

import torch
from transformers import BertModel, BertTokenizer

import random
import warnings
# Silence every warning for the rest of the session: no category or module
# filter is given, so deprecation notices from any library are hidden as well.
warnings.filterwarnings('ignore')

## Loading the Pretrained BERT model

In [3]:
# Load the pretrained tokenizer and encoder from the same checkpoint name so
# the vocabulary always matches the model weights.
MODEL_NAME = 'bert-base-cased'

tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

# output_hidden_states=True makes the model return the hidden states of every
# layer in addition to the final one.
# NOTE(review): only the model call passes cache_dir; the tokenizer call does not.
model = BertModel.from_pretrained(
    MODEL_NAME,
    output_hidden_states=True,
    cache_dir='../cache',
)

## Loading sentences

In [4]:
filepath = './data/sample.txt'

# Decode explicitly as UTF-8 instead of relying on the platform's default
# locale encoding: the corpus contains non-ASCII characters (e.g. bullet
# points), which would otherwise be mis-decoded or raise on some systems.
# NOTE(review): assumes the file is stored as UTF-8 — confirm.
with open(filepath, encoding='utf-8') as f:
    # One list entry per line of the file; each entry keeps its trailing newline.
    sentences = f.readlines()

# Example.
random.seed(42)
print("Example:\n    {}".format(random.choice(sentences)))

Example:
    • Changes in membership of any endpoint to the device pool or association of any endpoint to the redundancy group.



### Tokenization

In [5]:
# Convert each sentence into a list of vocabulary ids. With
# add_special_tokens=True the tokenizer adds its special tokens to every
# sequence (the example output below starts with 101 and ends with 102).
tokenized = []
for sent in sentences:
    tokenized.append(tokenizer.encode(sent, add_special_tokens=True))

# Example.
random.seed(42)
print("Example:\n    {}".format(random.choice(tokenized)))

Example:
    [101, 794, 21395, 1107, 5467, 1104, 1251, 1322, 7587, 1106, 1103, 4442, 4528, 1137, 3852, 1104, 1251, 1322, 7587, 1106, 1103, 1894, 22902, 7232, 1372, 119, 102]


### Padding

In [6]:
# Length of the longest tokenized sentence (0 when there are no sentences).
max_len = max((len(ids) for ids in tokenized), default=0)
print("Maximum length: {}".format(max_len))

# Right-pad every id list with zeros up to max_len so that all rows have the
# same length and can be packed into a single 2-D array.
# NOTE(review): 0 is presumably the tokenizer's [PAD] id — confirm.
padded_rows = []
for ids in tokenized:
    n_missing = max_len - len(ids)
    padded_rows.append(ids + [0] * n_missing)
padded = np.array(padded_rows)

# Example.
random.seed(42)
print("Example:\n    {}".format(random.choice(padded)))

Maximum length: 123
Example:
    [  101   794 21395  1107  5467  1104  1251  1322  7587  1106  1103  4442
  4528  1137  3852  1104  1251  1322  7587  1106  1103  1894 22902  7232
  1372   119   102     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0]


### Masking

In [7]:
# Build the mask from the padded ids: positions holding id 0 (the padding
# value) get 0, every other position gets 1.
attention_mask = np.where(padded == 0, 0, 1)

# Example.
random.seed(42)
print("Example:\n    {}".format(random.choice(attention_mask)))

Example:
    [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0]


### Convert to torch tensors

In [8]:
# Convert the NumPy arrays to torch tensors with an explicit 64-bit integer
# dtype. Without it the tensor inherits NumPy's platform default integer
# (32-bit on some platforms), whereas token ids are used as indices into the
# embedding table, for which long is the canonical dtype.
input_ids = torch.tensor(padded, dtype=torch.long)
attention_mask = torch.tensor(attention_mask, dtype=torch.long)

### Pass the input to BERT

In [11]:
# Inference only: run the forward pass without tracking gradients.
with torch.no_grad():
    # Because the model was loaded with output_hidden_states=True, the result
    # exposes three items by index:
    #  - output[0]: last_hidden_state, shape (batch_size, sequence_length, hidden_size).
    #  - output[1]: pooler_output, shape (batch_size, hidden_size), derived from the
    #    last-layer hidden state of the first (classification) token.
    #  - output[2]: hidden_states, 13 tensors of shape
    #    (batch_size, sequence_length, hidden_size): the initial embedding output
    #    followed by the 12 encoder layer outputs.
    output = model(input_ids, attention_mask=attention_mask)

# Split the output into its individual components.
last_hidden_states, pooler_output, hidden_states = output[0], output[1], output[2]

# Stack the per-layer tensors along a new leading axis, giving a single tensor
# of shape (layers, sentences, tokens, hidden_size).
hidden_states = torch.stack(hidden_states, dim=0)

# Print dimensions of output.
n_layers, n_sentences, n_tokens, embedding_dim = hidden_states.shape
print("Dimensions of hidden_states: {}".format(hidden_states.shape))
print("   - Number of layers (+1 with initial token embeddings): {}".format(n_layers))
print("   - Number of sentences: {}".format(n_sentences))
print("   - Number of tokens in a sentence: {}".format(n_tokens))
print("   - Dimension of an embedding : {}".format(embedding_dim))

Dimensions of hidden_states: torch.Size([13, 370, 123, 768])
   - Number of layers (+1 with initial token embeddings): 13
   - Number of sentences: 370
   - Number of tokens in a sentence: 123
   - Dimension of an embedding : 768
