In [21]:
import torch
from pytorch_pretrained_bert import GPT2Tokenizer, GPT2Model, GPT2LMHeadModel
import numpy as np

# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
import logging
logging.basicConfig(level=logging.INFO)

## First let's prepare a tokenized input with GPT2Tokenizer

In [120]:
# Load pre-trained model tokenizer (vocabulary)
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Encode some inputs
text_1 = "Do not tell him about the company secrets. Or my mum. Because I have an affair with my mum. "
text_2 = "Do not tell about my mum"
indexed_tokens_1 = tokenizer.encode(text_1)
indexed_tokens_2 = tokenizer.encode(text_2)

# Convert inputs to PyTorch tensors
tokens_tensor_1 = torch.tensor([indexed_tokens_1])
tokens_tensor_2 = torch.tensor([indexed_tokens_2])

INFO:pytorch_pretrained_bert.tokenization_gpt2:loading vocabulary file https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json from cache at /home/vili/.pytorch_pretrained_bert/f2808208f9bec2320371a9f5f891c184ae0b674ef866b79c58177067d15732dd.1512018be4ba4e8726e41b9145129dc30651ea4fec86aa61f4b9f40bf94eac71
INFO:pytorch_pretrained_bert.tokenization_gpt2:loading merges file https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt from cache at /home/vili/.pytorch_pretrained_bert/d629f792e430b3c76a1291bb2766b0a047e36fae0588f9dbc1ae51decdff691b.70bec105b4158ed9a1747fea67a43f5dee97855c64d62b6ec3742f4cfdb5feda


In [113]:
print(tokens_tensor_1.shape)
print(tokens_tensor_2.shape)

torch.Size([1, 9])
torch.Size([1, 6])


## Let's see how to use GPT2Model to get hidden states

In [114]:
# Load pre-trained model (weights)
model = GPT2Model.from_pretrained('gpt2')
model.eval()

# If you have a GPU, put everything on cuda
tokens_tensor_1 = tokens_tensor_1 #.to('cuda')
tokens_tensor_2 = tokens_tensor_2 #.to('cuda')
#model.to('cuda')

# Predict hidden states features for each layer
with torch.no_grad():
    hidden_states_1, past = model(tokens_tensor_1)
    # past can be used to reuse precomputed hidden state in a subsequent predictions
    # (see beam-search examples in the run_gpt2.py example).
    hidden_states_2, past = model(tokens_tensor_2, past=past)

INFO:pytorch_pretrained_bert.modeling_gpt2:loading weights file https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-pytorch_model.bin from cache at /home/vili/.pytorch_pretrained_bert/4295d67f022061768f4adc386234dbdb781c814c39662dd1662221c309962c55.778cf36f5c4e5d94c8cd9cefcf2a580c8643570eb327f0d4a1f007fab2acbdf1
INFO:pytorch_pretrained_bert.modeling_gpt2:loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-config.json from cache at /home/vili/.pytorch_pretrained_bert/4be02c5697d91738003fb1685c9872f284166aa32e061576bbe6aaeb95649fcf.085d5f6a8e7812ea05ff0e6ed0645ab2e75d80387ad55c1ad9806ee70d272f80
INFO:pytorch_pretrained_bert.modeling_gpt2:Model config {
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_layer": 12,
  "n_positions": 1024,
  "vocab_size": 50257
}



In [115]:
hidden_states_1.shape

torch.Size([1, 9, 768])

## And how to use GPT2LMHeadModel

In [121]:
# Load pre-trained model (weights)
model = GPT2LMHeadModel.from_pretrained('gpt2')
model.eval()

# If you have a GPU, put everything on cuda
tokens_tensor_1 = tokens_tensor_1 #.to('cuda')
tokens_tensor_2 = tokens_tensor_2 #.to('cuda')
#model.to('cuda')

# Predict all tokens
with torch.no_grad():
    predictions_1, past = model(tokens_tensor_1)
    # past can be used to reuse precomputed hidden state in a subsequent predictions
    # (see beam-search examples in the run_gpt2.py example).
    predictions_2, past = model(tokens_tensor_2, past=past)

# get the predicted last token
predicted_index = torch.argmax(predictions_2[0, -1, :]).item()
predicted_token = tokenizer.decode([predicted_index])

INFO:pytorch_pretrained_bert.modeling_gpt2:loading weights file https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-pytorch_model.bin from cache at /home/vili/.pytorch_pretrained_bert/4295d67f022061768f4adc386234dbdb781c814c39662dd1662221c309962c55.778cf36f5c4e5d94c8cd9cefcf2a580c8643570eb327f0d4a1f007fab2acbdf1
INFO:pytorch_pretrained_bert.modeling_gpt2:loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-config.json from cache at /home/vili/.pytorch_pretrained_bert/4be02c5697d91738003fb1685c9872f284166aa32e061576bbe6aaeb95649fcf.085d5f6a8e7812ea05ff0e6ed0645ab2e75d80387ad55c1ad9806ee70d272f80
INFO:pytorch_pretrained_bert.modeling_gpt2:Model config {
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_layer": 12,
  "n_positions": 1024,
  "vocab_size": 50257
}



## Understand the outputs

In [122]:
def print_preds(predictions, how_many = 10, verbose = False):
    for i in range(predictions.shape[1]):
        print(f"****** DIM {i} *********")
        top_10 = np.argpartition(predictions[0,i,:], -how_many)[-how_many:]
        if (verbose):
            print("the most probable was\n\t")
        print(tokenizer.decode([torch.argmax(predictions[0, i, :]).item()]), "\n")
        if (verbose):
            for ind in top_10:
                ind = ind.item()
                print(ind, "\t", tokenizer.decode([ind]), "\t\twith logprob\t", predictions[0,i,ind].item())


In [123]:
print("predicted was: ", predicted_index, "=", predicted_token, "\n")

print(text_1)
print_preds(predictions_1)


predicted was:  13 = . 

Do not tell him about the company secrets. Or my mum. Because I have an affair with my mum. 
****** DIM 0 *********
. 

****** DIM 1 *********
 be 

****** DIM 2 *********
 anyone 

****** DIM 3 *********
 that 

****** DIM 4 *********
 the 

****** DIM 5 *********
 fact 

****** DIM 6 *********
's 

****** DIM 7 *********
. 

****** DIM 8 *********
 He 

****** DIM 9 *********
 the 

****** DIM 10 *********
 personal 

****** DIM 11 *********
's 

****** DIM 12 *********
 Or 

****** DIM 13 *********
 I 

****** DIM 14 *********
'm 

****** DIM 15 *********
 a 

****** DIM 16 *********
 idea 

****** DIM 17 *********
 with 

****** DIM 18 *********
 a 

****** DIM 19 *********
 mum 

****** DIM 20 *********
. 

****** DIM 21 *********
 I 

****** DIM 22 *********
  



In [125]:
print(text_2)
print_preds(predictions_2, 5, True)

Do not tell about my mum
****** DIM 0 *********
the most probable was
	
 not 

1560 	  tell 		with logprob	 -24.417373657226562
314 	  I 		with logprob	 -24.355981826782227
340 	  it 		with logprob	 -24.340789794921875
407 	  not 		with logprob	 -19.683780670166016
345 	  you 		with logprob	 -21.48737144470215
****** DIM 1 *********
the most probable was
	
 tell 

1309 	  let 		with logprob	 -143.04208374023438
910 	  say 		with logprob	 -142.8894500732422
1561 	  talk 		with logprob	 -142.8363494873047
1560 	  tell 		with logprob	 -139.27957153320312
1265 	  ask 		with logprob	 -142.6316375732422
****** DIM 2 *********
the most probable was
	
 him 

2687 	  anyone 		with logprob	 -82.21723937988281
606 	  them 		with logprob	 -81.89225769042969
683 	  him 		with logprob	 -78.52623748779297
502 	  me 		with logprob	 -80.52213287353516
607 	  her 		with logprob	 -80.43750762939453
****** DIM 3 *********
the most probable was
	
 the 

607 	  her 		with logprob	 -78.84773254394531
340 	  

In [111]:
# how take the indeces of the biggest values
arr = np.array([1, 3, 2, 4, 5])
how_many = 3
indeces = np.argpartition(arr, - how_many) # small to big
indeces = indeces[-how_many:] # biggest
indeces

array([1, 3, 4])