In [1]:
import torch
import numpy as np
import pandas as pd

from transformers import AutoModelForCausalLM, AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


### Analyze the GPT-2 model and tokenizer

In [2]:
# Load the pretrained "gpt2" checkpoint together with its tokenizer, and put
# the model on the GPU when CUDA is available (CPU otherwise).
model_name = "gpt2"
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
model = model.to(device)

In [3]:
# Prompt text that gets tokenized and fed to the model.
seq = "Machine Learning with PyTorch can do amazing"
print(f"Input sequence: {seq}")

Input sequence: Machine Learning with PyTorch can do amazing


In [4]:
# Encode the prompt as PyTorch tensors (return_tensors="pt"), then move the
# encoded batch onto `device`.
encoding = tokenizer(seq, return_tensors="pt")
inputs = encoding.to(device)
print("Tokenized input data structure: \n", inputs)

Tokenized input data structure: 
 {'input_ids': tensor([[37573, 18252,   351,  9485, 15884,   354,   460,   466,  4998]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]])}


In [5]:
input_ids = inputs["input_ids"]  # token IDs only; the attention mask is left out
print("\nToken IDs and their words: ")
# Walk the first (index 0) row of the batch and decode each ID on its own so
# the word pieces are visible one per line.
# The loop variable is `token_id`, not `id`: in a notebook the loop variable
# stays in the kernel namespace, and `id` would shadow the builtin `id()` for
# every cell that runs afterwards.
for token_id in input_ids[0]:
    word = tokenizer.decode(token_id)
    print(token_id, word)


Token IDs and their words: 
tensor(37573) Machine
tensor(18252)  Learning
tensor(351)  with
tensor(9485)  Py
tensor(15884) Tor
tensor(354) ch
tensor(460)  can
tensor(466)  do
tensor(4998)  amazing


### Predicting the next token

In [6]:
# Inference only, so run the forward pass with gradient tracking disabled.
with torch.no_grad():
    outputs = model(**inputs)

# Keep just the last sequence position: one score per vocabulary entry for
# the token that would follow the prompt.
logits = outputs.logits[:, -1]

print("Vocabulary Size:", logits.size(-1))
print("Logits Shape:", logits.shape)
print("Logits:", logits)

Vocabulary Size: 50257
Logits Shape: torch.Size([1, 50257])
Logits: tensor([[-116.6087, -119.3603, -123.8558,  ..., -125.5929, -129.2048,
         -119.6724]])


In [7]:
# Normalise the last-position logits into probabilities over the vocabulary.
probs = torch.softmax(logits, dim=-1)

# Index of the most probable token, looked up once and reused below.
best_token_id = probs.argmax().item()

print("Next Token:", best_token_id)
print("Decoded Word:", tokenizer.decode(best_token_id))

print("Best Token Probability:", probs.max())
print("Probabilities:", probs)

Next Token: 1243
Decoded Word:  things
Best Token Probability: tensor(0.8678)
Probabilities: tensor([[4.3801e-05, 2.7958e-06, 3.1197e-08,  ..., 5.4916e-09, 1.4827e-10,
         2.0463e-06]])


In [8]:
# Gather the top prediction's ID, raw logit, probability and decoded text as
# plain Python values.
pred_id = logits.argmax().item()
logit_value = logits[0, pred_id].item()
prob_value = probs[0, pred_id].item()
pred_word = tokenizer.decode(pred_id)

# One labelled row per quantity, displayed as a single-column table.
summary = {
    "Token ID": pred_id,
    "Logits": logit_value,
    "Probability": prob_value,
    "Predicted Word": pred_word,
}
pd.DataFrame(list(summary.values()), index=list(summary), columns=["Value"])

Unnamed: 0,Value
Token ID,1243
Logits,-106.714706
Probability,0.867764
Predicted Word,things


### Let's look at the logic for predicting the next token

In [11]:
# Step-by-step greedy decoding: at every step record the `choices_per_step`
# most probable next tokens, then append the single most probable one to the
# input and repeat. The result is one table row per generated token.

#input_txt = "Transformers are the "                    # This one is interesting to see how things change
#input_txt = "Transformers are built using the"         # Note the word pieces

input_txt = "Machine Learning with PyTorch can do amazing"

input_ids = tokenizer(input_txt, return_tensors="pt")["input_ids"].to(device)
iterations = []          # one dict per step; becomes one DataFrame row each
n_steps = 10             # number of tokens to generate
choices_per_step = 5     # how many candidate tokens to show per step

with torch.no_grad():
    for _ in range(n_steps):
        iteration = {"Input": tokenizer.decode(input_ids[0])}
        # The whole sequence generated so far is fed through the model again
        # on every step.
        output = model(input_ids=input_ids)

        # Logits of the first batch row at the last position -> probabilities.
        next_token_logits = output.logits[0, -1, :]
        next_token_probs = torch.softmax(next_token_logits, dim=-1)

        # Only the best few candidates are needed, so ask for exactly those
        # (returned in descending order) instead of sorting the full vocabulary.
        top_probs, top_ids = torch.topk(next_token_probs, k=choices_per_step)

        # Store the tokens with the highest probabilities in the table row.
        # `.tolist()` yields plain Python floats for the percentage formatting.
        for choice_idx, (token_id, token_prob) in enumerate(
            zip(top_ids, top_probs.tolist()), start=1
        ):
            iteration[f"Choice {choice_idx}"] = (
                f"{tokenizer.decode(token_id)} ({100 * token_prob:.2f}%)"
            )
        iterations.append(iteration)

        # Append the most probable token to the input; `top_ids[:1]` has
        # shape (1,), and unsqueeze(0) makes it (1, 1) to match input_ids.
        input_ids = torch.cat([input_ids, top_ids[:1].unsqueeze(0)], dim=-1)

pd.DataFrame(iterations)

Unnamed: 0,Input,Choice 1,Choice 2,Choice 3,Choice 4,Choice 5
0,Machine Learning with PyTorch can do amazing,things (86.78%),work (2.53%),stuff (2.08%),thing (0.58%),tasks (0.41%)
1,Machine Learning with PyTorch can do amazing t...,. (27.47%),with (16.45%),", (12.07%)",for (8.76%),in (4.32%)
2,Machine Learning with PyTorch can do amazing t...,\n (22.20%),It (13.30%),The (4.67%),I (4.01%),We (3.84%)
3,Machine Learning with PyTorch can do amazing t...,\n (99.79%),The (0.02%),I (0.01%),A (0.01%),In (0.01%)
4,Machine Learning with PyTorch can do amazing t...,The (6.22%),I (4.16%),This (3.38%),Py (2.76%),In (2.50%)
5,Machine Learning with PyTorch can do amazing t...,Py (6.27%),first (4.72%),goal (2.83%),Python (2.58%),main (2.25%)
6,Machine Learning with PyTorch can do amazing t...,Tor (96.67%),Py (0.50%),T (0.27%),tor (0.12%),Data (0.07%)
7,Machine Learning with PyTorch can do amazing t...,ch (99.68%),cher (0.09%),che (0.09%),cho (0.02%),ches (0.01%)
8,Machine Learning with PyTorch can do amazing t...,project (3.77%),library (3.36%),API (2.66%),framework (2.17%),team (1.98%)
9,Machine Learning with PyTorch can do amazing t...,is (32.44%),has (11.87%),was (6.17%),uses (2.95%),aims (2.53%)
