## Generative Models with GPT-2

In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import numpy as np
import pandas as pd



#### Let's play with gpt2 and gpt2-xl
Note that we can use the Auto functions

In [2]:
model_name = 'gpt2'
model_name = 'gpt2-xl'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

In [3]:
seq = "Machine learning with PyTorch can do amazing"
print("\nInput sequence: ")
print(seq)


Input sequence: 
Machine learning with PyTorch can do amazing


In [4]:
inputs = tokenizer(seq, return_tensors="pt").to(device)
print("\nTokenized input data structure: ")
print(inputs)


Tokenized input data structure: 
{'input_ids': tensor([[37573,  4673,   351,  9485, 15884,   354,   460,   466,  4998]],
       device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}


In [5]:
input_ids = inputs["input_ids"]  # just IDS, no attn mask
print("\nToken IDs and their words: ")
for id in input_ids[0]:
  word = tokenizer.decode(id)
  print(id, word)


Token IDs and their words: 
tensor(37573, device='cuda:0') Machine
tensor(4673, device='cuda:0')  learning
tensor(351, device='cuda:0')  with
tensor(9485, device='cuda:0')  Py
tensor(15884, device='cuda:0') Tor
tensor(354, device='cuda:0') ch
tensor(460, device='cuda:0')  can
tensor(466, device='cuda:0')  do
tensor(4998, device='cuda:0')  amazing


#### Now let's run it through the model

In [6]:
with torch.no_grad():
  logits = model(**inputs).logits[:, -1, :]
print("\nAll logits for next word: ")
print(logits)
print(logits.shape)


All logits for next word: 
tensor([[ 3.5669,  1.4470, -3.1922,  ..., -6.6968, -7.5917,  0.0593]],
       device='cuda:0')
torch.Size([1, 50257])


In [7]:
probs = torch.softmax(logits, dim=-1)
print("\nAll probabilities: ")
print(probs)


All probabilities: 
tensor([[2.5635e-05, 3.0775e-06, 2.9744e-08,  ..., 8.9411e-10, 3.6535e-10,
         7.6823e-07]], device='cuda:0')


In [8]:
pred_id = torch.argmax(logits).item()
pred_word = tokenizer.decode(pred_id)
pd.DataFrame([pred_id, logits[0, pred_id].cpu(), probs[0, pred_id].cpu(), pred_word], 
              index=['Token ID', 'Logits', 'Probability', 'Predicted Word'], columns =['Value'])

Unnamed: 0,Value
Token ID,1243
Logits,tensor(14.0662)
Probability,tensor(0.9303)
Predicted Word,things


### Let's look a bit closer

In [9]:
import pandas as pd

#input_txt = "Transformers are the"
input_txt = "Transformers are the "                    # This one is interesting to see how things change
#input_txt = "Transformers are built using the"         # Note the word pieces

input_ids = tokenizer(input_txt, return_tensors="pt")["input_ids"].to(device)
iterations = []
n_steps = 10
choices_per_step = 5

with torch.no_grad():
    for _ in range(n_steps):
        iteration = dict()
        iteration["Input"] = tokenizer.decode(input_ids[0])
        output = model(input_ids=input_ids)
        
        # Select logits of the first batch and the last token and apply softmax to get the probability
        next_token_logits = output.logits[0, -1, :]
        next_token_probs = torch.softmax(next_token_logits, dim=-1)
        sorted_ids = torch.argsort(next_token_probs, dim=-1, descending=True)
        
        # Store tokens with highest probabilities in our little table
        for choice_idx in range(choices_per_step):
            token_id = sorted_ids[choice_idx]
            token_prob = next_token_probs[token_id].cpu().numpy()
            token_choice = (
                f"{tokenizer.decode(token_id)} ({100 * token_prob:.2f}%)"
            )
            iteration[f"Choice {choice_idx+1}"] = token_choice
        iterations.append(iteration)

            
        # Append predicted next token to input
        input_ids = torch.cat([input_ids, sorted_ids[None, 0, None]], dim=-1)

pd.DataFrame(iterations)

Unnamed: 0,Input,Choice 1,Choice 2,Choice 3,Choice 4,Choice 5
0,Transformers are the,(15.07%),ills (14.52%),________ (7.28%),icky (4.74%),_____ (4.51%)
1,Transformers are the,most (7.08%),ultimate (4.51%),original (2.48%),""" (1.87%)",main (1.79%)
2,Transformers are the most,popular (17.39%),common (5.30%),powerful (5.28%),famous (4.39%),successful (2.82%)
3,Transformers are the most popular,toy (10.29%),toys (6.68%),Transformers (6.46%),of (5.72%),and (5.04%)
4,Transformers are the most popular toy,line (50.22%),in (9.74%),of (7.60%),lines (5.07%),line (4.07%)
5,Transformers are the most popular toy line,in (36.55%),of (14.28%),", (5.79%)",on (3.88%),. (3.41%)
6,Transformers are the most popular toy line in,the (71.44%),history (7.64%),America (4.29%),Japan (2.48%),all (1.15%)
7,Transformers are the most popular toy line in...,world (68.34%),US (4.86%),history (3.50%),universe (3.18%),United (2.54%)
8,Transformers are the most popular toy line in...,. (32.81%),", (32.00%)",and (10.73%),( (2.07%),with (1.94%)
9,Transformers are the most popular toy line in...,(11.34%),They (10.70%),(8.73%),The (5.84%),I (4.05%)


### The generate method runs the transformer several steps

In [10]:
input_ids = tokenizer(input_txt, return_tensors="pt")["input_ids"].to(device)
output = model.generate(input_ids, max_new_tokens=n_steps, do_sample=False)
print(tokenizer.decode(output[0]))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Transformers are the  most popular toy line in the world. 


In [11]:
max_length = 128
input_txt = """In a shocking finding, scientist discovered \
a herd of unicorns living in a remote, previously unexplored \
valley, in the Andes Mountains. Even more surprising to the \
researchers was the fact that the unicorns spoke perfect English.\n\n
"""
input_ids = tokenizer(input_txt, return_tensors="pt")["input_ids"].to(device)
output_greedy = model.generate(input_ids, max_length=max_length, do_sample=False)
print(tokenizer.decode(output_greedy[0]))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In a shocking finding, scientist discovered a herd of unicorns living in a remote, previously unexplored valley, in the Andes Mountains. Even more surprising to the researchers was the fact that the unicorns spoke perfect English.


The researchers, from the University of California, Davis, and the University of Colorado, Boulder, were conducting a study on the Andean cloud forest, which is home to the rare species of cloud forest trees.


The researchers were surprised to find that the unicorns were able to communicate with each other, and even with humans.


The researchers were surprised to find that the unicorns were able


### Let's play around with sampling methods

In [12]:
# encode context the generation is conditioned on
input_ids = tokenizer.encode('I enjoy walking with my cute dog', return_tensors='pt').to(device)

In [13]:
# Some scoring functions

import torch.nn.functional as F

def log_probs_from_logits(logits, labels):
    logp = F.log_softmax(logits, dim=-1)
    logp_label = torch.gather(logp, 2, labels.unsqueeze(2)).squeeze(-1)
    return logp_label

def sequence_logprob(model, labels, input_len=0):
    with torch.no_grad():
        output = model(labels)
        log_probs = log_probs_from_logits(
            output.logits[:, :-1, :], labels[:, 1:])
        seq_log_prob = torch.sum(log_probs[:, input_len:])
    return seq_log_prob.cpu().numpy()

#### Greedy Search

In [14]:
# generate text until the output length (which includes the context length) reaches 50
greedy_output = model.generate(input_ids, max_length=50)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [15]:
logp = sequence_logprob(model, greedy_output, input_len=len(input_ids[0]))
print(tokenizer.decode(greedy_output[0]))
print(f"\nlog-prob: {logp:.2f}")

I enjoy walking with my cute dog, and I love to read. I'm a big fan of the Harry Potter series, and I'm a huge fan of the Harry Potter movies. I'm a huge fan of the Harry Potter books, and I

log-prob: -51.03


#### Beam Search

In [16]:
beam_output = model.generate(
    input_ids, 
    max_length=50, 
    num_beams=5, 
    early_stopping=True
)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [17]:
logp = sequence_logprob(model, beam_output, input_len=len(input_ids[0]))
print(tokenizer.decode(beam_output[0]))
print(f"\nlog-prob: {logp:.2f}")

I enjoy walking with my cute dog, but I don't want to walk with him in the rain. I don't want to walk in the rain. I don't want to walk in the rain. I don't want to walk in the rain

log-prob: -31.37


In [18]:
# set no_repeat_ngram_size to 2
beam_output = model.generate(
    input_ids, 
    max_length=50, 
    num_beams=5, 
    no_repeat_ngram_size=2, 
    early_stopping=True
)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [19]:
logp = sequence_logprob(model, beam_output, input_len=len(input_ids[0]))
print(tokenizer.decode(beam_output[0]))
print(f"\nlog-prob: {logp:.2f}")

I enjoy walking with my cute dog, but I don't want to walk with him in the rain."

"I'm not sure if it's a good idea to take my dog to the park," said one woman. "I think it

log-prob: -57.65


#### Sampling within the probability distribution of words

In [20]:
# set seed to reproduce results. Feel free to change the seed though to get different results
torch.random.manual_seed(42)

# activate sampling and deactivate top_k by setting top_k sampling to 0
sample_output = model.generate(
    input_ids, 
    do_sample=True, 
    max_length=50, 
    top_k=0
)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [21]:
logp = sequence_logprob(model, sample_output, input_len=len(input_ids[0]))
print(tokenizer.decode(sample_output[0]))
print(f"\nlog-prob: {logp:.2f}")

I enjoy walking with my cute dog, Sophia, and protagonist Alex Miller, and had the opportunity to see the original Mathers series with my son on the Auctus trail.

Trivia Edit

A prequel called Ashes was released

log-prob: -159.56


#### Change the temperature

In [22]:
# use temperature to decrease the sensitivity to low probability candidates
sample_output = model.generate(
    input_ids, 
    do_sample=True, 
    max_length=50, 
    top_k=0, 
    temperature=0.7
)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [23]:
logp = sequence_logprob(model, sample_output, input_len=len(input_ids[0]))
print(tokenizer.decode(sample_output[0]))
print(f"\nlog-prob: {logp:.2f}")

I enjoy walking with my cute dog and seeing the different types of plants close to me," said Deen, who lives in the Guadalupe Heights neighborhood. "I enjoyed it at the Festival of Trees, too."

The festival, which

log-prob: -98.73


#### Top-K sampling

In [24]:
# set top_k to 50
sample_output = model.generate(
    input_ids, 
    do_sample=True, 
    max_length=50, 
    top_k=50
)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [25]:
logp = sequence_logprob(model, sample_output, input_len=len(input_ids[0]))
print(tokenizer.decode(sample_output[0]))
print(f"\nlog-prob: {logp:.2f}")

I enjoy walking with my cute dog in the summer!

Is my place warm?

Please don't hesitate to contact me at any point if you have any questions about my furniture, or would like a quote on a new piece. I

log-prob: -85.71


#### Top-P sampling

In [26]:
# deactivate top_k sampling and sample only from 92% most likely words
sample_output = model.generate(
    input_ids, 
    do_sample=True, 
    max_length=50, 
    top_p=0.92, 
    top_k=0
)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [27]:
logp = sequence_logprob(model, sample_output, input_len=len(input_ids[0]))
print(tokenizer.decode(sample_output[0]))
print(f"\nlog-prob: {logp:.2f}")

I enjoy walking with my cute dog and cat, and biking. I never miss a moment of the summer, because you should feel good about your body and love the sun. When I am eating healthy, I never feel hungry. I find balance by

log-prob: -122.94


#### Combining Top-K and Top-P Sampling

In [31]:
# set top_k = 50 and set top_p = 0.95 and num_return_sequences = 3
sample_outputs = model.generate(
    input_ids,
    do_sample=True, 
    max_length=50, 
    top_k=25, 
    top_p=0.95, 
    num_return_sequences=3
)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [32]:
sample_outputs

tensor([[   40,  2883,  6155,   351,   616, 13779,  3290,   357,    64,  3498,
          5022,     8,   290, 13226,   262,  4167,   290,  5897,   286,   262,
          8222,    13, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256],
        [   40,  2883,  6155,   351,   616, 13779,  3290,   319,   616,  2166,
         33179,    13,   383,  6232,   318,  2407,  5897,    13,   314,   423,
           587,  6155,  1363,   422,   670,   351,   616,  3290,   329,   262,
           938,  1178, 12513,   290,   523,  1290,    11,  2147,   503,   286,
           262,  8850,    13,   198,   198,    40,  2107,   287,   262,  1748],
        [   40,  2883,  6155,   351,   616, 13779,  3290,    11,   475,   314,
           836,   470,   588,  2491,    11,   780,   340,  1838,   616,  7405,
         19597,    13,   314,   588,  2491,   319,

In [33]:
print("Output:\n" + 100 * '-')
for i, sample_output in enumerate(sample_outputs):
    print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))
    logp = sequence_logprob(model, sample_outputs, input_len=len(input_ids[0]))
    sample_outputs = sample_outputs[1:, :]
    print(f"\nlog-prob: {logp:.2f}\n")

Output:
----------------------------------------------------------------------------------------------------
0: I enjoy walking with my cute dog (a Lab mix) and enjoying the peace and quiet of the forest.

log-prob: -502.72

1: I enjoy walking with my cute dog on my front porch. The neighborhood is quite quiet. I have been walking home from work with my dog for the last few nights and so far, nothing out of the ordinary.

I live in the city

log-prob: -159.52

2: I enjoy walking with my cute dog, but I don't like running, because it makes my legs sore. I like running on a treadmill and playing basketball, and I like walking on a bike. But I don't like being out in the sun

log-prob: -79.76

