In [47]:
%load_ext autoreload
%autoreload 2
import numpy as np
out_dir = '../../results/nlp'
os.makedirs(out_dir, exist_ok=True)

import torch
from pytorch_transformers import BertTokenizer, BertForMaskedLM, BertForSequenceClassification, BertConfig

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [35]:
def pred(s: str, tokenizer, model):
    '''Predict for a string given tokenizer and model (returns class 1)
    '''
    with torch.no_grad():
        print('encoded', tokenizer.convert_ids_to_tokens(tokenizer.encode(s)))
        input_ids = torch.tensor(tokenizer.encode(s)).unsqueeze(0)  # Batch size 1
        pred = model(input_ids)[0].detach().softmax(dim=1).numpy().flatten()
    return pred

In [82]:
mdir = 'bert-base-uncased' # '/scratch/users/vision/chandan/pacmed/glue/SST-2-middle/'
config = BertConfig.from_pretrained(mdir)
tokenizer = BertTokenizer.from_pretrained(mdir)
clf = BertForSequenceClassification(config).eval()
masked_predictor = BertForMaskedLM.from_pretrained(mdir).eval()

In [73]:
text = 'amazing, wonderful, great movie'
text = 'terrible awful movie'
text = 'The actors are terrible.'

In [74]:
pred(text, tokenizer, clf)

encoded ['the', 'actors', 'are', 'terrible', '.']


array([0.5856249, 0.4143751], dtype=float32)

In [83]:
def predict_masked(masked_predictor, tokenizer,
                   text="[CLS] This great movie was very good . [SEP] I thoroughly enjoyed it [SEP]",
                   masked_index=6):
    tokenized_text = tokenizer.tokenize(text)

    # Mask a token that we will try to predict back with `BertForMaskedLM`
    tokenized_text[masked_index] = '[MASK]'
    print(tokenized_text)
    assert tokenized_text == ['[CLS]', 'this', 'great', 'movie', 'was', 'very', '[MASK]', '.', 
                              '[SEP]', 'i', 'thoroughly', 'enjoyed', 'it', '[SEP]']

    # Convert token to vocabulary indices
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    segment_ids = np.zeros(len(tokenized_text))
    segment_ids[7:] = 1

    # Convert inputs to PyTorch tensors
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])

    # If you have a GPU, put everything on cuda
    tokens_tensor = tokens_tensor.to('cuda')
    segments_tensors = segments_tensors.to('cuda')
    masked_predictor.to('cuda')

    # Predict all tokens
    with torch.no_grad():
        outputs = masked_predictor(tokens_tensor, token_type_ids=segments_tensors)
        predictions = outputs[0]

    preds_masked = predictions[0, masked_index].detach().cpu().numpy()
    inds_max = np.argsort(preds_masked)[::-1]
    return inds_max, preds_masked

text = "[CLS] This great movie was very good . [SEP] I thoroughly enjoyed it [SEP]"
inds_max, preds_masked = predict_masked(masked_predictor,
                                        tokenizer,
                                        text=text,
                                        masked_index=6)
for ind in inds_max[:20]:
    print(f'{tokenizer.convert_ids_to_tokens([ind])[0]}\t{preds_masked[ind]:0.2f}')

# tokenizer.convert_ids_to_tokens(inds_max[::-1])[:10]


['[CLS]', 'this', 'great', 'movie', 'was', 'very', '[MASK]', '.', '[SEP]', 'i', 'thoroughly', 'enjoyed', 'it', '[SEP]']
entertaining	10.70
good	10.58
popular	9.66
successful	9.62
exciting	9.60
enjoyable	9.29
funny	9.28
interesting	9.02
satisfying	8.63
beautiful	8.33
special	8.33
fun	8.22
impressive	8.04
important	7.74
wonderful	7.55
emotional	7.52
moving	7.49
effective	7.47
engaging	7.40
original	7.39
