In [2]:
import torch
import scipy
from tokenizers import *
from transformers import *


## Get pretrained embeddings

In [3]:
MODELS = [(BertModel,       BertTokenizer,       'bert-base-uncased'),
          (OpenAIGPTModel,  OpenAIGPTTokenizer,  'openai-gpt'),
          (GPT2Model,       GPT2Tokenizer,       'gpt2'),
          (CTRLModel,       CTRLTokenizer,       'ctrl'),
          (TransfoXLModel,  TransfoXLTokenizer,  'transfo-xl-wt103'),
          (XLNetModel,      XLNetTokenizer,      'xlnet-base-cased'),
          (XLMModel,        XLMTokenizer,        'xlm-mlm-enfr-1024'),
          (DistilBertModel, DistilBertTokenizer, 'distilbert-base-cased'),
          (RobertaModel,    RobertaTokenizer,    'roberta-base'),
          (XLMRobertaModel, XLMRobertaTokenizer, 'xlm-roberta-base'),
         ]

In [13]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] 

In [14]:
tokenizer.tokenize("Hello, my dog is cute")

['hello', ',', 'my', 'dog', 'is', 'cute']

In [16]:
last_hidden_states.shape

torch.Size([1, 8, 768])

In [17]:
last_hidden_states

tensor([[[-0.1144,  0.1937,  0.1250,  ..., -0.3827,  0.2107,  0.5407],
         [ 0.5308,  0.3207,  0.3665,  ..., -0.0036,  0.7579,  0.0388],
         [-0.4877,  0.8849,  0.4256,  ..., -0.6976,  0.4458,  0.1231],
         ...,
         [-0.7003, -0.1815,  0.3297,  ..., -0.4838,  0.0680,  0.8901],
         [-1.0355, -0.2567, -0.0317,  ...,  0.3197,  0.3999,  0.1795],
         [ 0.6080,  0.2610, -0.3131,  ...,  0.0311, -0.6283, -0.1994]]],
       grad_fn=<NativeLayerNormBackward>)

In [18]:
last_hidden_states.shape
# num_samples, num_tokens, embed_size

torch.Size([1, 8, 768])

In [9]:
last_hidden_states[0][7].shape

torch.Size([768])

In [12]:
last_hidden_states.mean(axis=1).squeeze().shape

torch.Size([768])

### Example cosine similarity

In [18]:
def get_sentence_similarity(s1, s2):
    ''' 
    Args:
        s1: str
        s2: str
    Returns:
        average similarity between the tokens of two vectors: float
    '''
    input_ids1 = torch.tensor(tokenizer.encode(s1)).unsqueeze(0)  # Batch size 1
    out1 = model(input_ids1)[0].mean(axis=1).squeeze().detach().numpy()
    input_ids2 = torch.tensor(tokenizer.encode(s2)).unsqueeze(0)  # Batch size 1
    out2 = model(input_ids2)[0].mean(axis=1).squeeze().detach().numpy()
    return scipy.spatial.distance.cosine(out1, out2)

In [27]:
# Taken from the SICK dataset
s1 = "The young boys are playing outdoors"
s2 = "A group of kids is playing in a yard"
get_sentence_similarity(s1,s2)

0.13384932279586792

In [26]:
s1 = "Two dogs are wrestling and hugging"
s2 = "There is no dog wrestling and hugging"
get_sentence_similarity(s1,s2)

0.23372882604599

### Compare with Bert-as-a-service


As you can see the above naive metric of just averaging the embeddings over all tokens in the input sequence does not do a great job. So we use this module to calculate sentence vectors - the creators of this library use various pooling strategies to compute a single vector for a sentence from all hidden layer representations.

In [32]:
# Run this after installing bert-serving-server and bert-serving-client by uncommenting the lines below
# !pip install bert-serving-server
# !pip install bert-serving-client

# Now download one of the bert models and run the bert server. Here I load the bert-base-cased
# Note that this server needs TF 1.10<= version < 2.0, so I started the server on another virtualenv 
# !bert-serving-start -model_dir ~/Downloads/cased_L-12_H-768_A-12/ -num_worker=4 

In [34]:
from sklearn.metrics.pairwise import cosine_similarity

In [36]:
from bert_serving.client import BertClient
bc = BertClient()


In [42]:
sentences = bc.encode(["Two dogs are wrestling and hugging", "There is no dog wrestling and hugging"])

In [43]:
sentences.shape

(2, 768)

In [44]:
cosine_similarity(sentences[0][:].reshape(1,-1),sentences[1][:].reshape(1,-1))

array([[0.9468931]], dtype=float32)

In [46]:
sentences = bc.encode(["The young boys are playing outdoors", "A group of kids is playing in a yard"])
cosine_similarity(sentences[0][:].reshape(1,-1),sentences[1][:].reshape(1,-1))

array([[0.9492906]], dtype=float32)

In [47]:
sentences = bc.encode(["I went to the mall today!", "Let's do some shopping in the afternoon."])
cosine_similarity(sentences[0][:].reshape(1,-1),sentences[1][:].reshape(1,-1))

array([[0.89849055]], dtype=float32)

## Training your own sequence classsification model

General Language Understanding Evaluation (GLUE) benchmark is a collection of nine sentence- or sentence-pair language understanding tasks for evaluating and analyzing natural language understanding systems.

For the purpose of the demo, we'll train the MRPC (Microsoft Research Paraphrase Corpus) to classify whether a sentence is a paraphrase of another sentence or not.

### Load model and data

In [75]:
import tensorflow_datasets
import tensorflow as tf
# Be sure that the TF version is 2.0 not 2.1, Transformers library is not tested for 2.1 and will throw errors

In [80]:
model = TFBertForSequenceClassification.from_pretrained('bert-base-cased')
data = tensorflow_datasets.load('glue/mrpc')

train_dataset = glue_convert_examples_to_features(data['train'], tokenizer, max_length=128, task='mrpc')
valid_dataset = glue_convert_examples_to_features(data['validation'], tokenizer, max_length=128, task='mrpc')
train_dataset = train_dataset.shuffle(100).batch(32).repeat(2).take(200)
valid_dataset = valid_dataset.batch(64).take(20)

INFO:absl:Overwrite dataset info from restored data version.
INFO:absl:Reusing dataset glue (/Users/skhurana/tensorflow_datasets/glue/mrpc/1.0.0)
INFO:absl:Constructing tf.data.Dataset for split None, from /Users/skhurana/tensorflow_datasets/glue/mrpc/1.0.0


### Train the model

In [81]:
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

# Train and evaluate using tf.keras.Model.fit()
history = model.fit(train_dataset, epochs=2, steps_per_epoch=115,
                    validation_data=valid_dataset, validation_steps=7)

Train for 115 steps, validate for 7 steps
Epoch 1/2
Epoch 2/2






### Save the model

In [83]:
model.save_pretrained('./model/')
pytorch_model = BertForSequenceClassification.from_pretrained('./model/', from_tf=True)

### Make predictions

In [85]:
sentence_0 = "This research was consistent with his findings."
sentence_1 = "His findings were compatible with this research."
sentence_2 = "His findings were not compatible with this research."
inputs_1 = tokenizer.encode_plus(sentence_0, sentence_1, add_special_tokens=True, return_tensors='pt')
inputs_2 = tokenizer.encode_plus(sentence_0, sentence_2, add_special_tokens=True, return_tensors='pt')

In [90]:
pred_1 = pytorch_model(inputs_1['input_ids'], token_type_ids=inputs_1['token_type_ids'])[0].argmax().item()
pred_2 = pytorch_model(inputs_2['input_ids'], token_type_ids=inputs_2['token_type_ids'])[0].argmax().item()

In [93]:
pred_1, pred_2
# 1 means it is a paraphrase, 0 means it's not
# predictions might be not be accurate because of limited sample size and training on CPU
# Proves the importance of GPU

(1, 1)

## Multilingual BERT

In [95]:
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
model = BertModel.from_pretrained('bert-base-multilingual-cased')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=569.0, style=ProgressStyle(description_…




In [97]:
# Spanish example: I tried to translate "Let's go to the mall today"
tokenizer.tokenize("Hola quieres ir al centro comercial hoy")

['Ho', '##la', 'quiere', '##s', 'ir', 'al', 'centro', 'comercial', 'hoy']

In [98]:
# Hindi example
tokenizer.tokenize("चलो आज कहीं चलते हैं")

['च', '##ल', '##ो', 'आज', 'क', '##ही', '##ं', 'च', '##ल', '##ते', 'हैं']

In [96]:
input_ids = torch.tensor(tokenizer.encode(["Hola quieres ir al centro comercial hoy"])).unsqueeze(0)  # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] 

In [99]:
last_hidden_states.shape

torch.Size([1, 3, 768])

### Masked LM

We'll try to predict the masked tokens in a piece of text with MaskedLM

In [119]:
model = BertForMaskedLM.from_pretrained('bert-base-multilingual-cased')
model.eval()

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=T

In [132]:
text = """
The embedding only happens in the bottom-most encoder. The abstraction that is common to all the encoders is that they receive a list of vectors each of the size 512 – In the bottom encoder that would be the word embeddings, but in other encoders, it would be the output of the encoder that’s directly below. The size of this list is hyperparameter we can set – basically it would be the length of the longest sentence in our training dataset."
"""

tokenized_text = tokenizer.tokenize(text)
print(tokenized_text)

['The', 'em', '##bed', '##ding', 'only', 'happens', 'in', 'the', 'bottom', '-', 'most', 'en', '##code', '##r', '.', 'The', 'abstract', '##ion', 'that', 'is', 'common', 'to', 'all', 'the', 'en', '##code', '##rs', 'is', 'that', 'they', 'receive', 'a', 'list', 'of', 'vector', '##s', 'each', 'of', 'the', 'size', '512', '[UNK]', 'In', 'the', 'bottom', 'en', '##code', '##r', 'that', 'would', 'be', 'the', 'word', 'em', '##bed', '##ding', '##s', ',', 'but', 'in', 'other', 'en', '##code', '##rs', ',', 'it', 'would', 'be', 'the', 'output', 'of', 'the', 'en', '##code', '##r', 'that', '[UNK]', 's', 'directly', 'below', '.', 'The', 'size', 'of', 'this', 'list', 'is', 'hy', '##per', '##para', '##meter', 'we', 'can', 'set', '[UNK]', 'basic', '##ally', 'it', 'would', 'be', 'the', 'length', 'of', 'the', 'longest', 'sentence', 'in', 'our', 'training', 'data', '##set', '.', '"']


In [133]:
target = "happens"
masked_index = tokenized_text.index(target)
tokenized_text[masked_index] = '[MASK]'

# Convert token to vocabulary indices
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
# Define sentence A and B indices associated to 1st and 2nd sentences (see paper)
segments_ids = [1] * len(tokenized_text)
# Convert inputs to PyTorch tensors
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])
# Load pre-trained model (weights)
# model should be in the environment before running this function
# Predict all tokens
predictions = model(tokens_tensor, segments_tensors)
predicted_index = torch.argmax(predictions[0][0, masked_index, :]).item()
predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])
print("Original:", text)
print("Masked:", " ".join(tokenized_text))
print("Predicted token:", predicted_token)
print("Other options:")
# just curious about what the next few options look like.
for i in range(10):
    predictions[0][0,masked_index,predicted_index] = -11100000
    predicted_index = torch.argmax(predictions[0][0, masked_index]).item()
    predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])
    print(predicted_token)

Original: 
The embedding only happens in the bottom-most encoder. The abstraction that is common to all the encoders is that they receive a list of vectors each of the size 512 – In the bottom encoder that would be the word embeddings, but in other encoders, it would be the output of the encoder that’s directly below. The size of this list is hyperparameter we can set – basically it would be the length of the longest sentence in our training dataset."

Masked: The em ##bed ##ding only [MASK] in the bottom - most en ##code ##r . The abstract ##ion that is common to all the en ##code ##rs is that they receive a list of vector ##s each of the size 512 [UNK] In the bottom en ##code ##r that would be the word em ##bed ##ding ##s , but in other en ##code ##rs , it would be the output of the en ##code ##r that [UNK] s directly below . The size of this list is hy ##per ##para ##meter we can set [UNK] basic ##ally it would be the length of the longest sentence in our training data ##set . "
Pre

In [116]:
predictions[0].shape

torch.Size([1, 113, 768])

In [117]:
predictions[1].shape

torch.Size([1, 768])

In [123]:
len(predictions)

1