In [92]:
import torch
import re

In [2]:
from transformers import BertTokenizer, BertModel, BertForMaskedLM

In [63]:
# Load pre-trained model tokenizer (vocabulary)
modelpath = "bert-large-uncased"
tokenizer = BertTokenizer.from_pretrained(modelpath)

HBox(children=(IntProgress(value=0, description='Downloading', max=231508, style=ProgressStyle(description_wid…




In [64]:
# sentence and target to predict
text = "dummy. although he had already eaten a large meal, he was still very hungry."
target = "hungry"

In [65]:
# Tokenize a sentence
tokenized_text = tokenizer.tokenize(text)
tokenized_text

['dummy',
 '.',
 'although',
 'he',
 'had',
 'already',
 'eaten',
 'a',
 'large',
 'meal',
 ',',
 'he',
 'was',
 'still',
 'very',
 'hungry',
 '.']

In [66]:
# Mask a token that we will try to predict back with `BertForMaskedLM`
masked_index = tokenized_text.index(target)
tokenized_text[masked_index] = '[MASK]'
tokenized_text

['dummy',
 '.',
 'although',
 'he',
 'had',
 'already',
 'eaten',
 'a',
 'large',
 'meal',
 ',',
 'he',
 'was',
 'still',
 'very',
 '[MASK]',
 '.']

In [67]:
# Convert token to vocabulary indices
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
indexed_tokens

[24369,
 1012,
 2348,
 2002,
 2018,
 2525,
 8828,
 1037,
 2312,
 7954,
 1010,
 2002,
 2001,
 2145,
 2200,
 103,
 1012]

In [68]:
# Define sentence A and B indices associated to 1st and 2nd sentences (see paper)
segments_ids = [1] * len(tokenized_text)
# this is for the dummy first sentence. 
segments_ids[0] = 0
segments_ids[1] = 0
segments_ids

[0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

In [69]:
# Convert inputs to PyTorch tensors
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])
#tokens_tensor
#segments_tensors

In [70]:
# Load pre-trained model (weights)
model = BertForMaskedLM.from_pretrained(modelpath)
model.eval()

HBox(children=(IntProgress(value=0, description='Downloading', max=314, style=ProgressStyle(description_width=…




HBox(children=(IntProgress(value=0, description='Downloading', max=1344997306, style=ProgressStyle(description…




BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,), eps=1e-12, elementw

In [71]:
tokens_tensor = tokens_tensor.to('cuda')
segments_tensors = segments_tensors.to('cuda')
model.to('cuda')

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,), eps=1e-12, elementw

In [72]:
# Predict all tokens
with torch.no_grad():
    outputs = model(tokens_tensor, token_type_ids=segments_tensors)
    predictions = outputs[0]

predicted_index = torch.argmax(predictions[0, masked_index]).item()
predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]

#predicted_index = torch.argmax(predictions[0, masked_index]).item()
#predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])

print("Original:", text)
print("Masked:", " ".join(tokenized_text))

print("Predicted token:", predicted_token)
print("Other options:")

Original: dummy. although he had already eaten a large meal, he was still very hungry.
Masked: dummy . although he had already eaten a large meal , he was still very [MASK] .
Predicted token: young
Other options:


In [73]:
text = "Mia Farrow [MASK] voice actor [MASK] The Last Unicorn [SEP] The Last Unicorn [MASK] character role [MASK] The Unicorn [SEP]"#" instance of [MASK] unicorn in a fictional work [MASK] different from [MASK] Unicorn [MASK] named after [MASK] Unicorn"

In [74]:
tokenizer.encode(text, add_special_tokens=True)

[101,
 8764,
 2521,
 10524,
 103,
 2376,
 3364,
 103,
 1996,
 2197,
 21830,
 1031,
 19802,
 1033,
 1996,
 2197,
 21830,
 103,
 2839,
 2535,
 103,
 1996,
 21830,
 1031,
 19802,
 1033,
 102]

In [81]:
del segments_tensors

In [103]:
# sentence and target to predict
#text = "Michelle Obama [MASK] spouse [MASK] Barack Obama"
#text = "Michelle Obama TO spouse TO Barack Obama"
#text = "Michelle Obama [MASK] spouse [MASK] Barack Obama"
#tokenized_text = ['mia farrow', '[MASK]', 'voice', 'actor', '[MASK]', 'the', 'last', 'unicorn']
text_1 = "[MASK] voice actor [MASK]" #""[SEP] The Last Unicorn [MASK] character role [MASK] The Unicorn [SEP]"#" instance of [MASK] unicorn in a fictional work [MASK] different from [MASK] Unicorn [MASK] named after [MASK] Unicorn"tokenizer.encode(text, add_special_tokens=True)
text_2 = "[MASK] character role [MASK]"
#tokenized_text = tokenizer.encode(text, add_special_tokens=True)
tokenized_text_1 = tokenizer.tokenize(text_1)
tokenized_text_2 = tokenizer.tokenize(text_2)
tokenized_text = ['[CLS]'] + ['Mia Farrow'] +tokenized_text_1 + ['The Last Unicorn'] + tokenized_text_2 + ['The Unicorn'] + ['[SEP]']
tokenized_text = ['Mia Farrow'] +tokenized_text_1 + ['The Last Unicorn'] + tokenized_text_2 + ['The Unicorn']

#text = "Mia Farrow [MASK] voice actor [MASK] The Last Unicorn [MASK] character role [MASK] The Unicorn [MASK] [MASK] [MASK] unicorn in a fictional work [MASK] different from [MASK] Unicorn [MASK] named after [MASK] Unicorn"
#tokenized_text = tokenizer.tokenize(text)

text = "Mia Farrow [MASK] voice actor [MASK] The Last Unicorn [MASK] character role [MASK] The Unicorn"
tokenized_text = tokenizer.tokenize(text)

raw_text = ['Mia Farrow', '[MASK]', 'voice actor', '[MASK]', 'The Last Unicorn', '[MASK]', 'character role', '[MASK]', 'The Unicorn']
tokenized_text = tokenizer.tokenize(" ".join(raw_text))

raw_text = ['Mia Farrow', '[MASK]', 'voice actor', '[MASK]', 'The Last Unicorn', '[MASK]', 'character role', '[MASK]', 'The Unicorn', '[MASK]', 'The Unicorn', '[MASK]', 'instance of', '[MASK]', 'unicorn in a fictional work', '[MASK]', 'different from', '[MASK]', 'Unicorn', '[MASK]', 'named after', '[MASK]', 'Unicorn', '[MASK]']
tokenized_text = tokenizer.tokenize(" ".join(raw_text))

#print("tokenized_text",tokenized_text)
#masked_index = tokenized_text.index("MASK")
#print(masked_index)
#tokenized_text[masked_index] = '[MASK]'
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
print("indexed_tokens",indexed_tokens)

# Define sentence A and B indices associated to 1st and 2nd sentences (see paper)
segments_ids = [1] * len(tokenized_text)
# this is for the dummy first sentence. 
#segments_ids[0] = 0
#segments_ids[1] = 0
#segments_ids[2] = 0
#segments_ids[3] = 0
#segments_ids[4] = 0
#segments_ids[1] = 0
for i in range(0,11+1):
    segments_ids[i] = 0
print(segments_ids)

tokens_tensor = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])

model = BertForMaskedLM.from_pretrained(modelpath)
model.eval()

tokens_tensor = tokens_tensor.to(torch.device('cuda:1'))
segments_tensors = segments_tensors.to(torch.device('cuda:1'))
model.to(torch.device('cuda:1'))

masked_indices = [i for i, x in enumerate(tokenized_text) if x == "[MASK]"]
masked_indices_raw = [i for i, x in enumerate(raw_text) if x == "[MASK]"]

with torch.no_grad():
    outputs = model(tokens_tensor, token_type_ids=segments_tensors)
    predictions = outputs[0]

completed_text = raw_text.copy()
print("source:", " ".join(completed_text))

for i_mi, masked_index in enumerate(masked_indices):
    predicted_index = torch.argmax(predictions[0, masked_index]).item()
    predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
    completed_text[masked_indices_raw[i_mi]] = predicted_token

completed_text = " ".join(completed_text)
completed_text = re.sub(r'\s*,\s*', ', ', completed_text)
completed_text = re.sub(r'\s*\.\s*', '. ', completed_text)
completed_text = re.sub(r'\s*\(\s*', ' (', completed_text)
completed_text = re.sub(r'\s*\)\s*', ') ', completed_text)

print("completed:",completed_text)

indexed_tokens [8764, 2521, 10524, 103, 2376, 3364, 103, 1996, 2197, 21830, 103, 2839, 2535, 103, 1996, 21830, 103, 1996, 21830, 103, 6013, 1997, 103, 21830, 1999, 1037, 7214, 2147, 103, 2367, 2013, 103, 21830, 103, 2315, 2044, 103, 21830, 103]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
source: Mia Farrow [MASK] voice actor [MASK] The Last Unicorn [MASK] character role [MASK] The Unicorn [MASK] The Unicorn [MASK] instance of [MASK] unicorn in a fictional work [MASK] different from [MASK] Unicorn [MASK] named after [MASK] Unicorn [MASK]
completed: Mia Farrow (voice actor) The Last Unicorn, character role of The Unicorn. The Unicorn an instance of a unicorn in a fictional work, different from the Unicorn, named after the Unicorn, 


In [76]:
#masked_index = tokenized_text.index("[MASK]")
#masked_index = 4
masked_indices = [i for i, x in enumerate(tokenized_text) if x == "[MASK]"]
masked_indices

[3, 6, 10, 13, 16, 17, 18, 24, 27, 29, 32]

In [77]:
# Predict all tokens
with torch.no_grad():
    outputs = model(tokens_tensor, token_type_ids=segments_tensors)
    predictions = outputs[0]

completed_text = tokenized_text.copy()
print("source:", " ".join(completed_text))

for masked_index in masked_indices:
    predicted_index = torch.argmax(predictions[0, masked_index]).item()
    predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
    completed_text[masked_index] = predicted_token

    
print("completed:"," ".join(completed_text))
#predicted_index = torch.argmax(predictions[0, masked_index]).item()
#predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])

#print(masked_predications)

#print("Original:", text)
#print("Masked:", " ".join(tokenized_text))
#
#print("Predicted token:", predicted_token)
#print("Other options:")

RuntimeError: Expected object of device type cuda but got device type cpu for argument #2 'mat2' in call to _th_mm

In [23]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
import logging
logging.basicConfig(level=logging.INFO)

# Load pre-trained model tokenizer (vocabulary)
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Encode a text inputs
text = "Who was Jim Henson ? Jim Henson was a"
text = "the president of the United States"
indexed_tokens = tokenizer.encode(text)

# Convert indexed tokens in a PyTorch tensor
tokens_tensor = torch.tensor([indexed_tokens])

INFO:transformers.tokenization_utils:loading file https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json from cache at /data/users/romain.claret/.cache/torch/transformers/f2808208f9bec2320371a9f5f891c184ae0b674ef866b79c58177067d15732dd.1512018be4ba4e8726e41b9145129dc30651ea4fec86aa61f4b9f40bf94eac71
INFO:transformers.tokenization_utils:loading file https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt from cache at /data/users/romain.claret/.cache/torch/transformers/d629f792e430b3c76a1291bb2766b0a047e36fae0588f9dbc1ae51decdff691b.70bec105b4158ed9a1747fea67a43f5dee97855c64d62b6ec3742f4cfdb5feda


In [38]:
# Load pre-trained model (weights)
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Set the model in evaluation mode to deactivate the DropOut modules
# This is IMPORTANT to have reproducible results during evaluation!
model.eval()

# If you have a GPU, put everything on cuda
tokens_tensor = tokens_tensor.to('cuda')
model.to('cuda')

# Predict all tokens
with torch.no_grad():
    outputs = model(tokens_tensor)
    predictions = outputs[0]

# get the predicted next sub-word (in our case, the word 'man')
predicted_index = torch.argmax(predictions[0, 3, :]).item()
predicted_text = tokenizer.decode(indexed_tokens + [predicted_index])
#assert predicted_text == 'Who was Jim Henson? Jim Henson was a man'

print(predicted_text)
#print(predicted_index)
#print(predictions)

INFO:transformers.configuration_utils:loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-config.json from cache at /data/users/romain.claret/.cache/torch/transformers/4be02c5697d91738003fb1685c9872f284166aa32e061576bbe6aaeb95649fcf.085d5f6a8e7812ea05ff0e6ed0645ab2e75d80387ad55c1ad9806ee70d272f80
INFO:transformers.configuration_utils:Model config {
  "attn_pdrop": 0.1,
  "embd_pdrop": 0.1,
  "finetuning_task": null,
  "initializer_range": 0.02,
  "is_decoder": false,
  "layer_norm_epsilon": 1e-05,
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_layer": 12,
  "n_positions": 1024,
  "num_labels": 1,
  "output_attentions": false,
  "output_hidden_states": false,
  "output_past": true,
  "pruned_heads": {},
  "resid_pdrop": 0.1,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "torchscript": false,
  "use_bfloat16": false,
  "vocab_size": 50257


the president of the United States United
