In [2]:
import torch
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

'cuda'

In [3]:
from typing import List

import torch
from transformers import BertForMaskedLM, BertTokenizer, GPT2Tokenizer

from bertram import BertramWrapper


2022-11-23 09:09:11.947789: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-11-23 09:09:15.746388: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-11-23 09:09:27.698358: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /share/apps/rc/software/CUDA/9.2.88-GCC-7.3.0-2.30/extras/CUPTI/lib64:/share/apps/rc/software/CUDA/9.2.88-GCC-7.3.0-2.30/lib64:/share/apps/rc/software/binutils/2.30-GCCcore-7.3.0/lib:/share/apps/rc/software/GCCco

In [4]:
def predict(inp: str, model: BertForMaskedLM, tokenizer: BertTokenizer, k: int = 3) -> List[str]:
    """
    Predict the top-k substitutes for an input text containing a single MASK token.

    :param inp: the input text; it must contain the tokenizer's mask token
    :param model: a masked language model
    :param tokenizer: the tokenizer corresponding to the model
    :param k: the number of predictions
    :return: the list of top-k substitutes for the MASK token, highest-scoring first
    :raises ValueError: if the encoded input does not contain the mask token
    """
    # GPT-2 tokenizers are the only ones that get the extra `add_prefix_space` argument.
    kwargs = {'add_prefix_space': True} if isinstance(tokenizer, GPT2Tokenizer) else {}
    token_ids = tokenizer.encode(inp, add_special_tokens=True, **kwargs)

    try:
        mask_idx = token_ids.index(tokenizer.mask_token_id)
    except ValueError:
        # Same exception type as before, but with a message that names the actual problem.
        raise ValueError(f'no mask token found in input: {inp!r}') from None

    # Build the batch (size 1) on the model's own device so the function also works
    # after the model has been moved to a GPU.
    model_device = next(model.parameters()).device
    input_ids = torch.tensor([token_ids], device=model_device)

    with torch.no_grad():
        # Take the first output by index rather than unpacking a 1-tuple: this keeps working
        # when the model returns additional outputs or a dict-like output object.
        predictions = model(input_ids)[0]

    # Scores over the vocabulary at the masked position; topk returns them in descending order.
    _, predicted_indices = torch.topk(predictions[0, mask_idx], k)
    return tokenizer.convert_ids_to_tokens(predicted_indices.tolist())



In [5]:

# Load a pre-trained BERTRAM model and the corresponding BERT model.
# NOTE(review): the checkpoint path is absolute and user-specific; adjust it when running elsewhere.
fused_model = BertramWrapper('/data/user/home/sungman/test/NLP Group Project/bertram-master/outputs/BERT_form-e10/', device=device)

bert = BertForMaskedLM.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Words to embed, each mapped to its example contexts.
# 'resigntaion' is a deliberate misspelling and is given no contexts.
words_with_contexts = {
    'kumquat': ['litchi, pineapple and kumquat is planned for the greenhouse.', 'kumquat and cranberry sherbet'],
    'resigntaion': []
}

# Infer a BERTRAM vector for a single word from its surface form and contexts
# (only the first five components are printed).
print(f'BERTRAM vector for "kumquat": {fused_model.infer_vector("kumquat", words_with_contexts["kumquat"])[:5]}')

# Infer BERTRAM vectors for all words and add them to the transformer's embedding matrix.
# For each word `w`, this creates a new token `<BERTRAM:w>` that can be used like a regular word.
fused_model.add_word_vectors_to_model(words_with_contexts, tokenizer, bert)

# The same two prompts, once with the plain word and once with its `<BERTRAM:w>` token.
inputs_bert = ["a kumquat is a [MASK].", "'resigntaion' is a misspelling of '[MASK]'."]
inputs_bertram = ["a <BERTRAM:kumquat> is a [MASK].", "'<BERTRAM:resigntaion>' is a misspelling of '[MASK]'."]

# Compare the top predictions for the [MASK] position with and without the BERTRAM token.
for input_bert, input_bertram in zip(inputs_bert, inputs_bertram):
    bert_predictions = predict(input_bert, bert, tokenizer)
    bertram_predictions = predict(input_bertram, bert, tokenizer)
    print(f'Input: {input_bert} \n\tBERT:    {bert_predictions}\n\tBERTRAM: {bertram_predictions}\n')


2022-11-23 09:09:57,203 - INFO - tokenization_utils - loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at /home/sungman/.cache/torch/transformers/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084
2022-11-23 09:09:57,269 - INFO - ngram_models - Found 712 ngrams with min count 4 and (nmin,nmax)=(3,5), first 10: ['UNK', 'PAD', 'ed<S>', 'ng<S>', 'ing', 'er<S>', 'ing<S>', 'on<S>', '<S>co', 'ion'], last 10: ['tly', 'tly<S>', 'cor', 'anc', 'ance', 'ance<S>', '<S>des', 'des', 'sio', 'sion']
2022-11-23 09:09:57,270 - INFO - utils - Loading embeddings from ./fcm/wordEmbeddings/glove.6B.50d.txt
2022-11-23 09:29:42,352 - INFO - utils - Done loading embeddings
2022-11-23 09:29:42,357 - INFO - configuration_utils - loading configuration file /data/user/home/sungman/test/NLP Group Project/bertram-master/outputs/BERT_form-e10/config.json
2022-11-23 09:29:42,358 - INFO 

BERTRAM vector for "kumquat": tensor([ 0.2446,  0.4382,  0.1391, -0.2657,  0.4427], device='cuda:0')


  torch.tensor(embedding, requires_grad = True)


Input: a kumquat is a [MASK]. 
	BERT:    ['noun', 'horse', 'dog']
	BERTRAM: ['word', 'letter', 'term']

Input: 'resigntaion' is a misspelling of '[MASK]'. 
	BERT:    ['john', 'king', 'son']
	BERTRAM: ['water', 's', '[UNK]']



In [6]:
# Same comparison as before, this time with the BERT_fused_test-e1 BERTRAM checkpoint.
bertram_dir = '/data/user/home/sungman/test/NLP Group Project/bertram-master/outputs/BERT_fused_test-e1/'
fused_model = BertramWrapper(bertram_dir, device=device)

# Reload BERT and its tokenizer before the new BERTRAM tokens are added below.
bert = BertForMaskedLM.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Words to embed, each with the contexts it appears in ('resigntaion' has none).
words_with_contexts = {
    'kumquat': [
        'litchi, pineapple and kumquat is planned for the greenhouse.',
        'kumquat and cranberry sherbet',
    ],
    'resigntaion': [],
}

# BERTRAM vector for one word, inferred from its surface form and contexts;
# only the first five components are shown.
kumquat_vector = fused_model.infer_vector("kumquat", words_with_contexts["kumquat"])
print(f'BERTRAM vector for "kumquat": {kumquat_vector[:5]}')

# Infer vectors for every word and add them to the transformer's embedding matrix:
# each word `w` becomes a new token `<BERTRAM:w>` that can be used like a regular word.
fused_model.add_word_vectors_to_model(words_with_contexts, tokenizer, bert)

# Paired prompts: the plain word versus its `<BERTRAM:w>` replacement.
inputs_bert = [
    "a kumquat is a [MASK].",
    "'resigntaion' is a misspelling of '[MASK]'.",
]
inputs_bertram = [
    "a <BERTRAM:kumquat> is a [MASK].",
    "'<BERTRAM:resigntaion>' is a misspelling of '[MASK]'.",
]

# Print the top predictions for each prompt pair side by side.
for input_bert, input_bertram in zip(inputs_bert, inputs_bertram):
    bert_predictions = predict(input_bert, bert, tokenizer)
    bertram_predictions = predict(input_bertram, bert, tokenizer)
    print(f'Input: {input_bert} \n\tBERT:    {bert_predictions}\n\tBERTRAM: {bertram_predictions}\n')


2022-11-23 09:29:57,535 - INFO - tokenization_utils - loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at /home/sungman/.cache/torch/transformers/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084
2022-11-23 09:29:57,588 - INFO - ngram_models - Found 712 ngrams with min count 4 and (nmin,nmax)=(3,5), first 10: ['UNK', 'PAD', 'ed<S>', 'ng<S>', 'ing', 'er<S>', 'ing<S>', 'on<S>', '<S>co', 'ion'], last 10: ['tly', 'tly<S>', 'cor', 'anc', 'ance', 'ance<S>', '<S>des', 'des', 'sio', 'sion']
2022-11-23 09:29:57,589 - INFO - utils - Loading embeddings from ./fcm/wordEmbeddings/glove.6B.50d.txt
2022-11-23 09:49:28,559 - INFO - utils - Done loading embeddings
2022-11-23 09:49:28,569 - INFO - configuration_utils - loading configuration file /data/user/home/sungman/test/NLP Group Project/bertram-master/outputs/BERT_fused_test-e1/config.json
2022-11-23 09:49:28,571 - 

<class 'torch.Tensor'>
<class 'torch.Tensor'>
BERTRAM vector for "kumquat": tensor([-0.0109,  1.0750, -0.2437,  0.1992,  0.6432], device='cuda:0')
<class 'torch.Tensor'>
<class 'torch.Tensor'>
Input: a kumquat is a [MASK]. 
	BERT:    ['noun', 'horse', 'dog']
	BERTRAM: ['word', 'name', 'letter']

Input: 'resigntaion' is a misspelling of '[MASK]'. 
	BERT:    ['john', 'king', 'son']
	BERTRAM: ['water', 's', '[UNK]']

