In [47]:
%load_ext autoreload
%autoreload 2
import numpy as np
out_dir = '../../results/nlp'
os.makedirs(out_dir, exist_ok=True)

import torch
from pytorch_transformers import BertTokenizer, BertForMaskedLM, BertForSequenceClassification, BertConfig

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [35]:
def pred(s: str, tokenizer, model):
    '''Predict for a string given tokenizer and model (returns class 1)
    '''
    with torch.no_grad():
        print('encoded', tokenizer.convert_ids_to_tokens(tokenizer.encode(s)))
        input_ids = torch.tensor(tokenizer.encode(s)).unsqueeze(0)  # Batch size 1
        pred = model(input_ids)[0].detach().softmax(dim=1).numpy().flatten()
    return pred

In [32]:
mdir = '/scratch/users/vision/chandan/pacmed/glue/SST-2-middle/'
config = BertConfig.from_pretrained(mdir)
tokenizer = BertTokenizer.from_pretrained(mdir)
clf = BertForSequenceClassification(config).eval()
masked_predictor = BertForMaskedLM.from_pretrained(mdir).eval()

In [39]:
text = 'amazing, wonderful, great movie'
text = 'terrible awful movie'

In [40]:
pred(text, tokenizer, clf)

encoded ['terrible', 'awful', 'movie']


array([0.5204421 , 0.47955787], dtype=float32)

In [None]:
# Tokenize input
text = "[CLS] This great movie was very good . [SEP] I thoroughly enjoyed it [SEP]"
tokenized_text = tokenizer.tokenize(text)
print(tokenized_text)

# Mask a token that we will try to predict back with `BertForMaskedLM`
masked_index = 6
tokenized_text[masked_index] = '[MASK]'

assert tokenized_text == ['[CLS]', 'this', 'great', 'movie', 'was', 'very', '[MASK]', '.', 
                          '[SEP]', 'i', 'thoroughly', 'enjoyed', 'it', '[SEP]']



# Convert token to vocabulary indices
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
segment_ids = np.zeros(len(tokenized_text))
segment_ids[7:] = 1
# segments_ids = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]

# Convert inputs to PyTorch tensors
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])

# If you have a GPU, put everything on cuda
tokens_tensor = tokens_tensor.to('cuda')
segments_tensors = segments_tensors.to('cuda')
masked_predictor.to('cuda')

# Predict all tokens
with torch.no_grad():
    outputs = masked_predictor(tokens_tensor, token_type_ids=segments_tensors)
    predictions = outputs[0]

# confirm we were able to predict 'henson'
predicted_index = torch.argmax(predictions[0, masked_index]).item()
predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
print(predicted_token)
assert predicted_token == 'henson'


# sanity checks

In [None]:
# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained(mdir)

# Tokenize input
text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
tokenized_text = tokenizer.tokenize(text)

# Mask a token that we will try to predict back with `BertForMaskedLM`
masked_index = 8
tokenized_text[masked_index] = '[MASK]'
assert tokenized_text == ['[CLS]', 'who', 'was', 'jim', 'henson', '?', '[SEP]', 'jim', '[MASK]', 'was', 'a', 'puppet', '##eer', '[SEP]']

# Convert token to vocabulary indices
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
# Define sentence A and B indices associated to 1st and 2nd sentences (see paper)
segments_ids = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]

# Convert inputs to PyTorch tensors
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])

# Load pre-trained model (weights)
model = BertForMaskedLM.from_pretrained(mdir)
model.eval()

# If you have a GPU, put everything on cuda
tokens_tensor = tokens_tensor.to('cuda')
segments_tensors = segments_tensors.to('cuda')
model.to('cuda')

# Predict all tokens
with torch.no_grad():
    outputs = model(tokens_tensor, token_type_ids=segments_tensors)
    predictions = outputs[0]

# confirm we were able to predict 'henson'
predicted_index = torch.argmax(predictions[0, masked_index]).item()
predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]

assert predicted_token == 'henson'

In [55]:
print(predicted_token)

crisp


In [59]:
inds_max = np.argsort(predictions[0, masked_index].detach().cpu())

In [61]:
inds_max

tensor([21844, 26217, 20688,  ...,  8897, 15558, 15594])

In [None]:
tokenizer.convert_ids_to_tokens(inds_max)