In [1]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, AutoTokenizer, AutoModelForCausalLM, AutoModelForMaskedLM

In [2]:
def get_model_and_tokenizer(model_class, model_name, cache_dir='/scratch/gpfs/kw1166/.cache/'):
  """Load a pretrained model and its matching slow tokenizer from a local cache.

  Both objects are loaded with ``local_files_only=True``, so the checkpoint
  files must already exist under ``cache_dir``.

  Args:
    model_class: Class exposing ``from_pretrained`` (e.g. ``AutoModelForMaskedLM``
      or ``AutoModelForCausalLM``).
    model_name: Checkpoint name passed to both ``from_pretrained`` calls
      (e.g. ``'bert-base-cased'``).
    cache_dir: Directory holding the cached checkpoint files. Defaults to the
      path that used to be hard-coded inside this function.

  Returns:
    A ``(lm_model, tokenizer)`` tuple. The model is created with
    ``output_hidden_states=True``; the tokenizer with ``use_fast=False``.
  """
  lm_model = model_class.from_pretrained(
      model_name,
      output_hidden_states=True,
      local_files_only=True,
      cache_dir=cache_dir
  )

  tokenizer = AutoTokenizer.from_pretrained(
      model_name,
      # Previously misspelled as 'add_predix_space', so the intended option
      # was never set on the tokenizer.
      add_prefix_space=True,
      local_files_only=True,
      cache_dir=cache_dir,
      use_fast=False
  )

  return (lm_model, tokenizer)


In [3]:
# CACHE_DIR = '/scratch/gpfs/kw1166/.cache/'
# tokenizer_2 = GPT2Tokenizer.from_pretrained('gpt2-xl',
#                                           add_prefix_space=True,
#                                           cache_dir=CACHE_DIR)
# tokenizer_2.pad_token = tokenizer_2.eos_token

# lm_model_2 = GPT2LMHeadModel.from_pretrained("gpt2-xl",
#                                            output_hidden_states=True,
#                                            cache_dir=CACHE_DIR)

# lm_model, tokenizer = get_model_and_tokenizer(AutoModelForCausalLM, 'gpt2-xl')
# tokenizer.pad_token = tokenizer.eos_token

# Load two BERT masked-LM checkpoints, each with its tokenizer.
# The names bound here (lm_model, tokenizer, lm_model2, tokenizer2) are
# notebook globals that the cells below rely on.
lm_model, tokenizer = get_model_and_tokenizer(AutoModelForMaskedLM, 'bert-base-cased')
lm_model2, tokenizer2 = get_model_and_tokenizer(AutoModelForMaskedLM, 'bert-large-uncased')

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model 

In [4]:
# Put lm_model in evaluation mode (turns off its Dropout layers) before inference.
lm_model.eval()

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

### All sentences Batched

In [5]:
# Sanity check: ids 100-103 decode to the special tokens [UNK] [CLS] [SEP] [MASK]
# in the bert-base-cased vocabulary. Then show both tokenizer configurations.
print(tokenizer.decode([100,101,102,103]))
print(tokenizer, tokenizer2)

[UNK] [CLS] [SEP] [MASK]
PreTrainedTokenizer(name_or_path='bert-base-cased', vocab_size=28996, model_max_len=512, is_fast=False, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}) PreTrainedTokenizer(name_or_path='bert-large-uncased', vocab_size=30522, model_max_len=512, is_fast=False, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})


In [6]:
# Word windows that each end in [MASK]: they grow by one word per entry, and
# the last entry drops the leading 'hello' so the window slides instead.
sentences = ['[MASK]',
 'hello [MASK]',
 'hello world [MASK]',
 'hello world there [MASK]',
 'hello world there you [MASK]',
 'world there you are [MASK]'
 ]
# input_ids = tokenizer(sentences, padding=True, return_tensors='pt')
# NOTE: despite its name, input_ids holds the whole encoding (input_ids,
# token_type_ids and attention_mask), padded to the longest sentence.
input_ids = tokenizer.batch_encode_plus(sentences, padding=True, return_tensors='pt')
# input_ids2 = tokenizer2.batch_encode_plus(sentences, padding=True, return_tensors='pt')

print(input_ids)
# print(input_ids2)
# Forward pass over the padded batch. The last element of the output is taken
# as the hidden states requested via output_hidden_states=True at load time.
lm_outputs = lm_model(**input_ids)
transformer_hidden_states = lm_outputs[-1]

{'input_ids': tensor([[  101,   103,   102,     0,     0,     0,     0],
        [  101, 19082,   103,   102,     0,     0,     0],
        [  101, 19082,  1362,   103,   102,     0,     0],
        [  101, 19082,  1362,  1175,   103,   102,     0],
        [  101, 19082,  1362,  1175,  1128,   103,   102],
        [  101,  1362,  1175,  1128,  1132,   103,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 0, 0, 0, 0],
        [1, 1, 1, 1, 0, 0, 0],
        [1, 1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1]])}


In [7]:
# Show every sentence's token ids next to its attention mask, so the padded
# positions (mask value 0) can be checked row by row.
id_rows = input_ids['input_ids']
mask_rows = input_ids['attention_mask']
for token_row, mask_row in zip(id_rows, mask_rows):
    print(token_row, mask_row)

tensor([101, 103, 102,   0,   0,   0,   0]) tensor([1, 1, 1, 0, 0, 0, 0])
tensor([  101, 19082,   103,   102,     0,     0,     0]) tensor([1, 1, 1, 1, 0, 0, 0])
tensor([  101, 19082,  1362,   103,   102,     0,     0]) tensor([1, 1, 1, 1, 1, 0, 0])
tensor([  101, 19082,  1362,  1175,   103,   102,     0]) tensor([1, 1, 1, 1, 1, 1, 0])
tensor([  101, 19082,  1362,  1175,  1128,   103,   102]) tensor([1, 1, 1, 1, 1, 1, 1])
tensor([ 101, 1362, 1175, 1128, 1132,  103,  102]) tensor([1, 1, 1, 1, 1, 1, 1])


In [8]:
# Use the last entry of the hidden-state tuple as the per-token embeddings
# (presumably the final encoder layer — confirm against the transformers docs).
embeddings = transformer_hidden_states[-1]
print(embeddings)

tensor([[[ 0.1323,  0.3435,  0.1365,  ...,  0.2833, -0.2909,  0.3727],
         [ 0.4390, -0.0905,  0.2218,  ..., -0.3375,  0.3586,  0.3006],
         [ 1.1752,  0.4498, -0.6960,  ...,  0.1995,  0.8212, -0.1582],
         ...,
         [ 0.0307, -0.1238,  0.1624,  ..., -0.3096,  0.3233,  0.1251],
         [ 0.2535,  0.1223, -0.2642,  ..., -0.1411,  0.5176,  0.5266],
         [ 0.1693,  0.1635, -0.2344,  ..., -0.1314,  0.5437,  0.3968]],

        [[ 0.4933,  0.3707,  0.2677,  ..., -0.2835,  0.5303, -0.0569],
         [ 0.5455, -0.4865,  0.7949,  ..., -0.5479,  0.1502,  0.0675],
         [ 0.4508,  0.3423,  0.4286,  ..., -0.5160,  0.4282, -0.0413],
         ...,
         [ 0.1143,  0.0660,  0.3881,  ..., -0.2771,  0.4815,  0.0912],
         [ 0.1674,  0.0898,  0.4107,  ..., -0.3823,  0.4465,  0.0524],
         [ 0.3055,  0.4224,  0.3344,  ..., -0.3969,  0.4021, -0.0621]],

        [[ 0.3388,  0.4806,  0.3763,  ..., -0.2291,  0.5335, -0.1023],
         [ 0.5272, -0.5086,  0.7069,  ..., -0

In [9]:
# Broadcast the per-token padding mask across the hidden dimension so that it
# has the same shape as `embeddings` and can be multiplied element-wise.
token_mask = input_ids['attention_mask']
attn_mask = token_mask.unsqueeze(-1).expand_as(embeddings)
attn_mask

tensor([[[1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]],

        [[1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]],

        [[1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         ...,
         [1, 1, 1,  ..., 1, 1, 1],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]],

        [[1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         ...,
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         [0, 0, 0,  ..., 0, 0, 0]],

        [[1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1

In [62]:
# Compare shapes: embeddings carries an extra trailing hidden dimension that
# the 2-D attention mask does not have.
embeddings.shape, input_ids['attention_mask'].shape

(torch.Size([6, 7, 768]), torch.Size([6, 7]))

In [16]:
# Zero out the embedding vectors at padded positions (where the mask is 0).
embeddings * attn_mask

tensor([[[ 1.0588,  0.1301,  0.2188,  ..., -4.4375,  0.7642,  0.2327],
         [ 0.0000,  0.0000,  0.0000,  ..., -0.0000,  0.0000, -0.0000],
         [ 0.0000,  0.0000,  0.0000,  ..., -0.0000,  0.0000, -0.0000],
         [ 0.0000,  0.0000,  0.0000,  ..., -0.0000,  0.0000, -0.0000],
         [ 0.0000,  0.0000,  0.0000,  ..., -0.0000,  0.0000, -0.0000]],

        [[ 1.0588,  0.1301,  0.2188,  ..., -4.4375,  0.7642,  0.2327],
         [-0.2217,  0.3650,  0.6309,  ..., -1.3968,  0.1588,  0.1691],
         [ 0.0000,  0.0000,  0.0000,  ..., -0.0000,  0.0000, -0.0000],
         [ 0.0000,  0.0000,  0.0000,  ..., -0.0000,  0.0000, -0.0000],
         [ 0.0000,  0.0000,  0.0000,  ..., -0.0000,  0.0000, -0.0000]],

        [[ 1.0588,  0.1301,  0.2188,  ..., -4.4375,  0.7642,  0.2327],
         [-0.2217,  0.3650,  0.6309,  ..., -1.3968,  0.1588,  0.1691],
         [ 0.0775,  0.1162,  0.7933,  ..., -1.5378, -0.0938,  0.9396],
         [ 0.0000,  0.0000,  0.0000,  ..., -0.0000,  0.0000,  0.0000],
  

In [22]:
import torch

# Sum-pool over the sequence dimension. Padded positions were zeroed by the
# mask, so they add nothing to each sentence's pooled vector.
masked_embeddings = embeddings * attn_mask
torch.sum(masked_embeddings, dim=1)

tensor([[  1.0588,   0.1301,   0.2188,  ...,  -4.4375,   0.7642,   0.2327],
        [  0.8371,   0.4952,   0.8497,  ...,  -5.8343,   0.9230,   0.4018],
        [  0.9147,   0.6114,   1.6430,  ...,  -7.3722,   0.8292,   1.3414],
        ...,
        [  2.4972,   0.7955,   0.9381,  ...,  -8.5987,   2.1432,  -0.4440],
        [  2.7568,   1.1913,   1.6058,  ..., -10.3531,   3.0460,   0.1953],
        [ -3.2951,  -0.4990,   1.1670,  ...,  -9.8724,   2.1661,  -0.0779]],
       grad_fn=<SumBackward1>)

In [30]:
# Pre-split inputs: one tuple of words per example.
sentences = [('Hello',),  # trailing comma is required: ('Hello') is just the str 'Hello'
             ('Hello', 'world'),
             ('Hello', 'world', 'hello'),
             ('Hello', 'world', 'there'),
             ('Hello', 'world', 'there', 'you'),
             ('Hello', 'world', 'there', 'you', 'are'),
             ('world', 'there', 'you', 'are', 'flying')]

In [31]:
# tokenizer(sentences, padding=False) # doesn't work
# Encode the pre-split word sequences as one padded batch of PyTorch tensors.
# NOTE(review): the saved output pads with id 50256 and has no token_type_ids,
# which does not match the BERT tokenizer loaded earlier (pad id 0). Presumably
# it was produced while the gpt2-xl lines were active; re-run to refresh.
input_dict = tokenizer.batch_encode_plus(sentences, is_split_into_words=True, padding=True, return_tensors='pt')
input_dict

{'input_ids': tensor([[18435, 50256, 50256, 50256, 50256],
        [18435,   995, 50256, 50256, 50256],
        [18435,   995, 23748, 50256, 50256],
        [18435,   995,   612, 50256, 50256],
        [18435,   995,   612,   345, 50256],
        [18435,   995,   612,   345,   389],
        [  995,   612,   345,   389,  7348]]), 'attention_mask': tensor([[1, 0, 0, 0, 0],
        [1, 1, 0, 0, 0],
        [1, 1, 1, 0, 0],
        [1, 1, 1, 0, 0],
        [1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1]])}

In [34]:
import numpy as np
import torch

# Last entry of the hidden-state tuple from the most recent forward pass.
myvec =  transformer_hidden_states[-1]
# NOTE(review): torch.norm without `dim` returns a single scalar for the whole
# tensor, so this rescales the entire batch by one value instead of giving each
# token vector unit length. If per-vector L2 normalisation was intended, the
# norm needs dim=-1, keepdim=True — confirm intent.
myvec2 = myvec/torch.norm(myvec, p=2)
myvec2

In [53]:
# tok_to_str = tokenizer.batch_decode(input_ids['input_ids'],
#                                     skip_special_tokens=True)

### One sentence at a time

In [84]:

import torch
import torch.utils.data as data

# Hand-built window of bert-base-cased token ids: [CLS] hello [MASK] [SEP],
# i.e. the same ids the tokenizer produces for 'hello [MASK]'.
windows = [(101,19082,103,102)]
input_ids = torch.tensor(windows)

data_dl = data.DataLoader(input_ids, batch_size=1, shuffle=False)
for batch_idx, batch in enumerate(data_dl):
    if batch_idx == 0:
        # batch = batch.to("cuda")
        model_output = lm_model(batch)
        # Only the first batch is used, so stop here instead of draining the
        # rest of the loader.
        break
# The last output element is taken as the hidden states (the model was loaded
# with output_hidden_states=True); its last entry is printed.
transformer_hidden_states = model_output[-1]
print(transformer_hidden_states[-1].shape)
print(transformer_hidden_states[-1])

torch.Size([1, 4, 768])
tensor([[[ 0.4933,  0.3707,  0.2677,  ..., -0.2835,  0.5303, -0.0569],
         [ 0.5455, -0.4865,  0.7949,  ..., -0.5479,  0.1502,  0.0675],
         [ 0.4508,  0.3423,  0.4286,  ..., -0.5160,  0.4282, -0.0413],
         [ 0.9906,  0.0174, -0.2410,  ..., -0.1005,  1.2535, -0.3444]]],
       grad_fn=<NativeLayerNormBackward>)


In [79]:
# Single-sentence run (a batch of one, so no padding is added), presumably for
# comparison with the corresponding row of the batched run.
sentences = ['[MASK]']
input_ids = tokenizer(sentences, padding=True, return_tensors='pt')
print(input_ids)
lm_outputs = lm_model(**input_ids)
# Last output element is taken as the hidden states; print its last entry.
transformer_hidden_states = lm_outputs[-1]
print(transformer_hidden_states[-1].shape)
print(transformer_hidden_states[-1])

{'input_ids': tensor([[101, 103, 102]]), 'token_type_ids': tensor([[0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1]])}
torch.Size([1, 3, 768])
tensor([[[ 0.1323,  0.3435,  0.1365,  ...,  0.2833, -0.2909,  0.3727],
         [ 0.4390, -0.0905,  0.2218,  ..., -0.3375,  0.3586,  0.3006],
         [ 1.1752,  0.4498, -0.6960,  ...,  0.1995,  0.8212, -0.1582]]],
       grad_fn=<NativeLayerNormBackward>)


In [83]:
# Single-sentence run (a batch of one, so no padding is added), presumably for
# comparison with the corresponding row of the batched run.
sentences = ['hello [MASK]']
input_ids = tokenizer(sentences, padding=True, return_tensors='pt')
print(input_ids)
lm_outputs = lm_model(**input_ids)
# Last output element is taken as the hidden states; print its last entry.
transformer_hidden_states = lm_outputs[-1]
print(transformer_hidden_states[-1].shape)
print(transformer_hidden_states[-1])

{'input_ids': tensor([[  101, 19082,   103,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1]])}
torch.Size([1, 4, 768])
tensor([[[ 0.4933,  0.3707,  0.2677,  ..., -0.2835,  0.5303, -0.0569],
         [ 0.5455, -0.4865,  0.7949,  ..., -0.5479,  0.1502,  0.0675],
         [ 0.4508,  0.3423,  0.4286,  ..., -0.5160,  0.4282, -0.0413],
         [ 0.9906,  0.0174, -0.2410,  ..., -0.1005,  1.2535, -0.3444]]],
       grad_fn=<NativeLayerNormBackward>)


In [37]:
# Single-sentence run: encode one sentence, run the model, print the last
# entry of the returned hidden states.
# NOTE(review): the saved output has hidden size 1600 and no token_type_ids,
# unlike the BERT runs (768). Presumably it came from an earlier gpt2-xl
# session; re-run to refresh.
sentences = ['hello world hello']
input_ids = tokenizer(sentences, padding=True, return_tensors='pt')
print(input_ids)
lm_outputs = lm_model(**input_ids)
transformer_hidden_states = lm_outputs[-1]
print(transformer_hidden_states[-1].shape)
print(transformer_hidden_states[-1])

{'input_ids': tensor([[18435,   995, 23748]]), 'attention_mask': tensor([[1, 1, 1]])}
torch.Size([1, 3, 1600])
tensor([[[ 1.0588,  0.1301,  0.2188,  ..., -4.4375,  0.7642,  0.2327],
         [-0.2217,  0.3650,  0.6309,  ..., -1.3968,  0.1588,  0.1691],
         [ 0.0775,  0.1162,  0.7933,  ..., -1.5378, -0.0938,  0.9396]]],
       grad_fn=<ViewBackward>)


In [195]:
# Single-sentence run: encode one sentence, run the model, print the last
# entry of the returned hidden states.
# NOTE(review): the saved output has hidden size 1600 and no token_type_ids,
# unlike the BERT runs (768). Presumably it came from an earlier gpt2-xl
# session; re-run to refresh.
sentences = ['Hello world there you']
input_ids = tokenizer(sentences, padding=True, return_tensors='pt')
print(input_ids)
lm_outputs = lm_model(**input_ids)
transformer_hidden_states = lm_outputs[-1]
print(transformer_hidden_states[-1].shape)
print(transformer_hidden_states[-1])

{'input_ids': tensor([[18435,   995,   612,   345]]), 'attention_mask': tensor([[1, 1, 1, 1]])}
torch.Size([1, 4, 1600])
tensor([[[ 1.0588,  0.1301,  0.2188,  ..., -4.4375,  0.7642,  0.2327],
         [-0.2217,  0.3650,  0.6309,  ..., -1.3968,  0.1588,  0.1691],
         [ 0.4922,  0.5799,  0.3748,  ..., -1.2212,  0.4659, -0.0181],
         [ 1.1680, -0.2796, -0.2864,  ..., -1.5432,  0.7543, -0.8277]]],
       grad_fn=<ViewBackward>)


In [31]:
# Single-sentence run: encode one sentence, run the model, print the last
# entry of the returned hidden states.
# NOTE(review): the saved output has hidden size 1600 and no token_type_ids,
# unlike the BERT runs (768). Presumably it came from an earlier gpt2-xl
# session; re-run to refresh.
sentences = ['Hello world there you are']
input_ids = tokenizer(sentences, padding=True, return_tensors='pt')
print(input_ids)
lm_outputs = lm_model(**input_ids)
transformer_hidden_states = lm_outputs[-1]
print(transformer_hidden_states[-1].shape)
print(transformer_hidden_states[-1])

{'input_ids': tensor([[18435,   995,   612,   345,   389]]), 'attention_mask': tensor([[1, 1, 1, 1, 1]])}
torch.Size([1, 5, 1600])
tensor([[[ 1.0588,  0.1301,  0.2188,  ..., -4.4375,  0.7642,  0.2327],
         [-0.2217,  0.3650,  0.6309,  ..., -1.3968,  0.1588,  0.1691],
         [ 0.4922,  0.5799,  0.3748,  ..., -1.2212,  0.4659, -0.0181],
         [ 1.1680, -0.2796, -0.2864,  ..., -1.5432,  0.7543, -0.8277],
         [ 0.2596,  0.3958,  0.6677,  ..., -1.7543,  0.9028,  0.6392]]],
       grad_fn=<ViewBackward>)


In [32]:
# Single-sentence run: encode one sentence, run the model, print the last
# entry of the returned hidden states.
# NOTE(review): the saved output has hidden size 1600 and no token_type_ids,
# unlike the BERT runs (768). Presumably it came from an earlier gpt2-xl
# session; re-run to refresh.
sentences = ['world there you are high']
input_ids = tokenizer(sentences, padding=True, return_tensors='pt')
print(input_ids)
lm_outputs = lm_model(**input_ids)
transformer_hidden_states = lm_outputs[-1]
print(transformer_hidden_states[-1].shape)
print(transformer_hidden_states[-1])

{'input_ids': tensor([[ 995,  612,  345,  389, 1029]]), 'attention_mask': tensor([[1, 1, 1, 1, 1]])}
torch.Size([1, 5, 1600])
tensor([[[-0.4205,  0.6139,  1.0727,  ..., -4.7529,  1.2246, -0.4381],
         [-0.1484, -0.2284,  0.3114,  ..., -1.4831,  0.4668, -0.4868],
         [-0.0156, -0.6524,  0.0106,  ..., -1.2291,  0.0511, -0.2832],
         [-1.7830,  0.0059,  0.2923,  ..., -1.4245, -0.0341,  0.6478],
         [-0.9276, -0.2381, -0.5200,  ..., -0.9828,  0.4576,  0.4824]]],
       grad_fn=<ViewBackward>)


In [173]:
import torch.utils.data as data
# Wrap input_ids in a DataLoader that yields one item per batch.
# NOTE(review): input_ids is rebound by several earlier cells (a plain tensor
# in one, tokenizer encodings in others), so what this loader wraps depends on
# which cell ran last — confirm the intended input.
dl = data.DataLoader(input_ids, batch_size=1)