# Pre-trained GPT-2 model end to end

## Objective

1. Explore Pre-trained GPT-2
2. Run input text - "cat sat on the" through Pre-trained GPT-2 and extract next token predictions

## Load Pre-trained GPT-2 model

In [1]:
from collections import OrderedDict
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import torch
import torch.nn.functional as F

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
%%capture

# Load pre-trained model and tokenizer
model_name = "gpt2"

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the model
model = AutoModelForCausalLM.from_pretrained(model_name)

# Force model to use CPU
model = model.to("cpu")
# Set the model to evaluation mode
model.eval()
## Disable Dropout layers to ensure deterministic outputs

In [3]:
# Confirm loading worked and show the module tree of the loaded model
print(
    "Model and tokenizer loaded successfully.",
    f"Model: {model}",
    sep="\n",
)

Model and tokenizer loaded successfully.
Model: GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)


## Extract hidden state of each layer in the model

In [4]:
def get_all_hidden_states(model, input_ids, hidden=False):
    """Run the transformer body of ``model`` on ``input_ids`` without gradients.

    Args:
        model: Object exposing a callable ``transformer`` attribute that
            accepts ``input_ids`` and an ``output_hidden_states`` keyword.
        input_ids: Token ids passed straight through to ``model.transformer``.
        hidden: When True, ask the transformer for its hidden states and
            return only those; when False (default), return the transformer
            output object unchanged.

    Returns:
        ``transformer_outputs.hidden_states`` if ``hidden`` is True, otherwise
        the full output of ``model.transformer``.
        NOTE(review): for Hugging Face models ``hidden_states`` is documented
        as the embedding output plus one entry per block - confirm against
        the installed transformers version.
    """
    with torch.no_grad():
        transformer_outputs = model.transformer(input_ids, output_hidden_states=hidden)

    if not hidden:
        # len() of the output object counts its fields, not the model's
        # layers, so it is labelled accordingly.
        print(f"Number of transformer output fields: {len(transformer_outputs)}")
        return transformer_outputs

    all_hidden_states = transformer_outputs.hidden_states
    print(f"Number of hidden state layers: {len(all_hidden_states)}")
    return all_hidden_states


## Set custom LM-head

- Extract logits
- Apply temperature scaling and top-k sampling
- Select final token predictions

In [5]:
def lm_head(model, last_token_context_vector, temperature=0.7, k=50):
    """Turn a context vector into a top-k, temperature-scaled distribution.

    The vector is projected to vocabulary logits with the model's output
    embedding layer, divided by ``temperature``, restricted to the ``k``
    largest logits along the last dimension, and passed through softmax.

    Args:
        model: Object whose ``get_output_embeddings()`` returns a callable
            mapping the context vector to logits.
        last_token_context_vector: Tensor fed to the output projection. It
            may be a single vector or carry leading batch dimensions; all
            filtering is done along the last dimension.
        temperature: Positive scaling factor. Lower values sharpen the
            distribution, higher values flatten it.
        k: Number of highest-scoring entries kept. Values larger than the
            size of the last dimension are clamped to that size.

    Returns:
        Tensor with the same shape as the logits, holding probabilities that
        sum to 1 along the last dimension and are 0 outside the top ``k``.

    Raises:
        ValueError: If ``temperature`` is not positive or ``k`` is below 1.
    """
    if temperature <= 0:
        raise ValueError(f"temperature must be > 0, got {temperature}")
    if k < 1:
        raise ValueError(f"k must be >= 1, got {k}")

    # 1. Project to vocabulary logits (named so it does not shadow this function)
    output_projection = model.get_output_embeddings()  # model.lm_head
    logits = output_projection(last_token_context_vector)

    # 2. Temperature scaling
    scaled_logits = logits / temperature

    # 3. Top-k filtering along the vocabulary (last) dimension
    k = min(k, scaled_logits.size(-1))
    top_k_logits, top_k_indices = torch.topk(scaled_logits, k, dim=-1)

    # Start from -inf everywhere so filtered-out entries get probability 0,
    # then write the surviving logits back at their original positions.
    # The scatter must use the same dimension as topk/softmax, otherwise
    # batched input is indexed along the wrong axis.
    filtered_logits = torch.full_like(scaled_logits, float("-inf"))
    filtered_logits.scatter_(-1, top_k_indices, top_k_logits)

    # 4. Normalise into a probability distribution
    return F.softmax(filtered_logits, dim=-1)


In [6]:
## Sample top_n tokens from the probability distribution produced by the LM-head

def get_top_token(probabilities, tokenizer, top_n=1):
    """Sample ``top_n`` token ids from ``probabilities`` and decode them.

    Note that the ids are drawn with ``torch.multinomial`` (random sampling
    weighted by probability), not taken as the ``top_n`` highest entries.

    Args:
        probabilities: Tensor of probabilities to sample from.
        tokenizer: Object with a ``decode`` method accepting a list of ids.
        top_n: Number of ids to draw.

    Returns:
        The value returned by a single ``tokenizer.decode`` call on all
        sampled ids together.
    """
    # Draw top_n ids from the discrete distribution
    sampled_ids = torch.multinomial(probabilities, num_samples=top_n)

    # All sampled ids go through one decode call
    decoded = tokenizer.decode(sampled_ids.tolist())

    print(f"\n✅ Final Selected {top_n} Tokens: '{decoded}'")
    return decoded


## Run experiment
### Run custom LM-head and sample 5 next-token candidates

In [7]:
input_text = "cat sat on the"

# Fixed seed so the sampled tokens below are the same on every run of this cell
SAMPLING_SEED = 42

# Tokenize input text
input_ids = tokenizer(input_text, return_tensors="pt").input_ids
print(f"Input prompt: '{input_text}'")

# With the default hidden=False the helper returns the raw transformer output.
# Index 0 of that output is the final hidden state (`last_hidden_state` in the
# transformers model-output docs); [-1, -1, :] picks the last sequence in the
# batch and its last token position.
all_token_hidden_states = get_all_hidden_states(model, input_ids)
last_token_rep = all_token_hidden_states[0][-1, -1, :]

probabilities = lm_head(model, last_token_rep)

# torch.multinomial inside get_top_token draws from the global RNG; seeding
# right before the call makes this cell idempotent under Restart & Run All.
torch.manual_seed(SAMPLING_SEED)
top_tokens = get_top_token(probabilities, tokenizer, top_n=5)

Input prompt: 'cat sat on the'
Number of hidden state layers: 2

✅ Final Selected 5 Tokens: ' bench ground floor bed edge'


### Next token prediction (via LM-head defined in the model)

In [8]:
# Full forward pass through the model, including its built-in LM head;
# gradients are not needed for inspection
with torch.no_grad():
    outputs = model(input_ids)

# Inspect the structure of the returned output object
print(f"\nModel Output length: {len(outputs)}")
print(f"\nModel Output Keys: {outputs.keys()}")

# Logits for every position, then for the last position of the last sequence
print(f"\nModel Output Logits Shape: {outputs.logits.shape}")
print(f"\nModel Output last token Logits Shape: {outputs.logits[-1, -1, :].shape}")



Model Output length: 2

Model Output Keys: odict_keys(['logits', 'past_key_values'])

Model Output Logits Shape: torch.Size([1, 4, 50257])

Model Output last token Logits Shape: torch.Size([50257])


In [9]:
# Get the logits for the last token position
# Logits at the last token position of the last sequence in the batch
next_token_logits = outputs.logits[-1, -1, :]

# Normalise the logits into probabilities over the vocabulary
probs = next_token_logits.softmax(dim=-1)

# Greedy choice: the single most probable token id, decoded back to text
predicted_token_id = int(probs.argmax())
predicted_token = tokenizer.decode([predicted_token_id])

print(f"Next token prediction (from default lm-head): '{predicted_token}'")

Next token prediction (from default lm-head): ' floor'
