In [10]:
pip install torch torchvision

Collecting torch
  Downloading torch-2.10.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (31 kB)
Collecting torchvision
  Downloading torchvision-0.25.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (5.4 kB)
Collecting setuptools (from torch)
  Downloading setuptools-81.0.0-py3-none-any.whl.metadata (6.6 kB)
Collecting sympy>=1.13.3 (from torch)
  Downloading sympy-1.14.0-py3-none-any.whl.metadata (12 kB)
Collecting networkx>=2.5.1 (from torch)
  Downloading networkx-3.6.1-py3-none-any.whl.metadata (6.8 kB)
Collecting jinja2 (from torch)
  Downloading jinja2-3.1.6-py3-none-any.whl.metadata (2.9 kB)
Collecting cuda-bindings==12.9.4 (from torch)
  Downloading cuda_bindings-12.9.4-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (2.6 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.8.93 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.8.93-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl.metadata (1.7 kB)
Collecting nvidia-cuda-runtime-cu12==12.8.90 (from torch)

In [11]:
#lets run a full llm pipeline
# Input text → Tokenization → Converting to IDs → Model processing → Next token prediction → Token selection → Building the response
from transformers import AutoModelForCausalLM, AutoTokenizer
import os

save_directory = "./downloaded_model"  # Change this to your preferred path

# Create the directory if it doesn't exist
os.makedirs(save_directory, exist_ok=True)
model_name = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

Loading weights: 100%|██████████| 76/76 [00:00<00:00, 593.96it/s, Materializing param=transformer.wte.weight]            
[1mGPT2LMHeadModel LOAD REPORT[0m from: distilgpt2
Key                                        | Status     |  | 
-------------------------------------------+------------+--+-
transformer.h.{0, 1, 2, 3, 4, 5}.attn.bias | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


In [14]:
#load model and tokenizer from the saved directory
tokenizer.save_pretrained(save_directory)
model.save_pretrained(save_directory)
# Load the model and tokenizer from the saved directory
tokenizer = AutoTokenizer.from_pretrained(save_directory)
model = AutoModelForCausalLM.from_pretrained(save_directory)

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards: 100%|██████████| 1/1 [00:01<00:00,  1.11s/it]
Loading weights: 100%|██████████| 76/76 [00:00<00:00, 312.94it/s, Materializing param=transformer.wte.weight]            


In [16]:
#prompt or input
prompt = "What is the capital of France?"

In [19]:
# Tokenize the input
tokens = tokenizer.tokenize(prompt)

print("Tokenization Result:")
for i, token in enumerate(tokens):
    print(f"Token {i+1}: '{token}'")

Tokenization Result:
Token 1: 'What'
Token 2: 'Ġis'
Token 3: 'Ġthe'
Token 4: 'Ġcapital'
Token 5: 'Ġof'
Token 6: 'ĠFrance'
Token 7: '?'


In [21]:
# convert tokens to input IDs
input_ids = tokenizer.convert_tokens_to_ids(tokens)
print("\nInput IDs:", input_ids)
#model input tensor shape and type
import torch
input_tensor = torch.tensor([input_ids])
print("\nInput Tensor Shape:", input_tensor.shape)
print("Input Tensor Type:", input_tensor.dtype)


Input IDs: [2061, 318, 262, 3139, 286, 4881, 30]

Input Tensor Shape: torch.Size([1, 7])
Input Tensor Type: torch.int64


In [27]:
#model processing 
with torch.no_grad():
    outputs = model(input_tensor)
    next_token_logits = outputs.logits

#output logits shape and type
print("\nNext Token Logits Shape:", next_token_logits.shape)
print("\nNext Token Logits Shape:", next_token_logits[0].shape)
print("Next Token Logits Type:", next_token_logits.dtype)


Next Token Logits Shape: torch.Size([1, 7, 50257])

Next Token Logits Shape: torch.Size([7, 50257])
Next Token Logits Type: torch.float32


In [30]:
# We want the predictions for the last position (after "transforming")
next_token_logits = next_token_logits[0, -1, :]

# Convert logits to probabilities
next_token_probs = torch.softmax(next_token_logits, dim=0)

# Get the top 10 most likely tokens
top_k = 10
topk_probs, topk_indices = torch.topk(next_token_probs, top_k)

# Convert to lists for easier handling
topk_probs = topk_probs.detach().numpy()
topk_indices = topk_indices.detach().numpy()

# Get the corresponding tokens
topk_tokens = [tokenizer.decode([idx]) for idx in topk_indices]

print("Top 10 Predictions for Next Token:")
print("-" * 40)
print(f"{'Token':<15} {'ID':<8} {'Probability':<10}")
print("-" * 40)
for i in range(top_k):
    print(f"{repr(topk_tokens[i]):<15} {topk_indices[i]:<8} {topk_probs[i]*100:.2f}%")

Top 10 Predictions for Next Token:
----------------------------------------
Token           ID       Probability
----------------------------------------
'\n'            198      23.75%
'�'             447      6.55%
' It'           632      4.33%
' The'          383      4.09%
' I'            314      2.51%
' And'          843      2.51%
' What'         1867     2.28%
' Is'           1148     2.28%
' A'            317      1.47%
' In'           554      1.39%


In [31]:
#token selection (greedy)
predicted_token_id = torch.argmax(next_token_logits).item()
predicted_token = tokenizer.decode([predicted_token_id])
print(f"\nGreedy Predicted Next Token: '{predicted_token}' (ID:{predicted_token_id})")

# display token-selection candidates (top-k) and highlight the chosen token
print("\nToken Selection Candidates (Top-k):")
print("-" * 40)
print(f"{'Token':<15} {'ID':<8} {'Probability':<10} {'Selected'}")
print("-" * 40)
for i in range(top_k):
    is_selected = "*" if int(topk_indices[i]) == predicted_token_id else ""
    print(f"{repr(topk_tokens[i]):<15} {topk_indices[i]:<8} {topk_probs[i]*100:.2f}% {is_selected}")



Greedy Predicted Next Token: '
' (ID:198)

Token Selection Candidates (Top-k):
----------------------------------------
Token           ID       Probability Selected
----------------------------------------
'\n'            198      23.75% *
'�'             447      6.55% 
' It'           632      4.33% 
' The'          383      4.09% 
' I'            314      2.51% 
' And'          843      2.51% 
' What'         1867     2.28% 
' Is'           1148     2.28% 
' A'            317      1.47% 
' In'           554      1.39% 


In [32]:
#building the response (greedy)
response = prompt + predicted_token
print(f"\nGenerated Response (Greedy): '{response}'")

# show many possible responses from top-k next-token candidates
print("\nPossible Responses (Top-k):")
for i in range(top_k):
    candidate_response = prompt + topk_tokens[i]
    print(f"{i+1:>2}. {candidate_response}")



Generated Response (Greedy): 'What is the capital of France?
'

Possible Responses (Top-k):
 1. What is the capital of France?

 2. What is the capital of France?�
 3. What is the capital of France? It
 4. What is the capital of France? The
 5. What is the capital of France? I
 6. What is the capital of France? And
 7. What is the capital of France? What
 8. What is the capital of France? Is
 9. What is the capital of France? A
10. What is the capital of France? In


In [37]:
#step by step llm pipeline demonstration complete!
import numpy as np

def generate_step_by_step(prompt, max_new_tokens=5, temperature=0.7, top_k=5):
    """Generate text token by token with detailed output at each step"""
    # Start with the prompt
    current_text = prompt
    input_ids = tokenizer.encode(prompt, return_tensors="pt")
    
    print(f"Starting prompt: '{prompt}'\n")
    
    # Generate new tokens one by one
    for i in range(max_new_tokens):
        print(f"--- Step {i+1}: Generating token #{len(prompt.split())+i+1} ---")
        
        # Get model predictions
        with torch.no_grad():
            outputs = model(input_ids)
        
        # Get next token logits (predictions for the next token)
        next_token_logits = outputs.logits[0, -1, :]
        
        # Apply temperature
        next_token_logits = next_token_logits / temperature
        
        # Get top-k token indices and their probabilities
        topk_probs, topk_indices = torch.topk(torch.softmax(next_token_logits, dim=0), top_k)
        
        # Print the top candidates
        print("\nTop candidates:")
        for j in range(top_k):
            token_id = topk_indices[j].item()
            token_text = tokenizer.decode([token_id])
            token_prob = topk_probs[j].item() * 100
            print(f"  {j+1}. '{token_text}' (ID: {token_id}, Probability: {token_prob:.2f}%)")
        
        # Renormalize probabilities for top-k
        topk_probs = topk_probs / topk_probs.sum()
        
        # Sample from top-k
        chosen_idx = np.random.choice(topk_indices.detach().numpy(), p=topk_probs.detach().numpy())
        chosen_token = tokenizer.decode([chosen_idx])
        chosen_token_raw = tokenizer.convert_ids_to_tokens([int(chosen_idx)])[0]
        
        print(f"\nSelected token: '{chosen_token}'")
        print(f"Selected token raw: {chosen_token_raw!r}")
        print(f"Selected token repr: {chosen_token!r}")
        
        # Update for next iteration
        next_token = torch.tensor([[chosen_idx]])
        input_ids = torch.cat([input_ids, next_token], dim=1)
        current_text += chosen_token
        
        print(f"Text so far: '{current_text}'")
        print(f"Text so far repr: {current_text!r}\n")
    
    print(f"Final generated text: '{current_text}'")
    return current_text

# Generate text step by step
prompt = "AI is Transforming"
final_text = generate_step_by_step(prompt, max_new_tokens=5, temperature=0.7, top_k=5)

Starting prompt: 'AI is Transforming'

--- Step 1: Generating token #4 ---

Top candidates:
  1. ' the' (ID: 262, Probability: 74.38%)
  2. ' a' (ID: 257, Probability: 2.00%)
  3. ' The' (ID: 383, Probability: 1.61%)
  4. ' and' (ID: 290, Probability: 1.54%)
  5. '.' (ID: 13, Probability: 1.38%)

Selected token: ' the'
Selected token raw: 'Ġthe'
Selected token repr: ' the'
Text so far: 'AI is Transforming the'
Text so far repr: 'AI is Transforming the'

--- Step 2: Generating token #5 ---

Top candidates:
  1. ' World' (ID: 2159, Probability: 10.41%)
  2. ' Internet' (ID: 4455, Probability: 3.91%)
  3. ' world' (ID: 995, Probability: 3.34%)
  4. ' U' (ID: 471, Probability: 3.12%)
  5. ' US' (ID: 1294, Probability: 2.03%)

Selected token: ' Internet'
Selected token raw: 'ĠInternet'
Selected token repr: ' Internet'
Text so far: 'AI is Transforming the Internet'
Text so far repr: 'AI is Transforming the Internet'

--- Step 3: Generating token #6 ---

Top candidates:
  1. '.' (ID: 13, Prob

In [39]:

# Function to generate text with different parameters
def generate_with_params(prompt, max_new_tokens=15, **params):
    input_ids = tokenizer.encode(prompt, return_tensors="pt")
    
    # Set up generation parameters
    gen_params = {}
    if 'temperature' in params:
        gen_params['temperature'] = params['temperature']
    if 'top_k' in params:
        gen_params['top_k'] = params['top_k']
    if 'top_p' in params:
        gen_params['top_p'] = params['top_p']
    if 'do_sample' in params:
        gen_params['do_sample'] = params['do_sample']
    
    # Generate the output
    output_ids = model.generate(
        input_ids, 
        max_length=len(input_ids[0]) + max_new_tokens,
        pad_token_id=tokenizer.eos_token_id,
        **gen_params
    )
    
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

# Try different parameter combinations
params_to_try = [
    {'name': 'Greedy (no sampling)', 'params': {'do_sample': False}},
    {'name': 'Low Temperature (0.3)', 'params': {'temperature': 0.3, 'do_sample': True}},
    {'name': 'High Temperature (1.5)', 'params': {'temperature': 1.5, 'do_sample': True}},
    {'name': 'Top-k (5)', 'params': {'top_k': 5, 'do_sample': True}},
    {'name': 'Top-p (0.9)', 'params': {'top_p': 0.9, 'do_sample': True}},
    {'name': 'Balanced', 'params': {'temperature': 0.7, 'top_k': 50, 'top_p': 0.9, 'do_sample': True}}
]

# Generate and display results
print("Effect of Generation Parameters:\n")

for setting in params_to_try:
    output = generate_with_params(prompt, **setting['params'])
    generated_part = output[len(prompt):]
    
    print(f"{setting['name']}")
    print(f"Parameters: {setting['params']}")
    print(f"Input: {prompt}")
    print(f"Generated: {generated_part}")
    print("-" * 80)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Effect of Generation Parameters:

Greedy (no sampling)
Parameters: {'do_sample': False}
Input: AI is Transforming
Generated:  the World.












--------------------------------------------------------------------------------
Low Temperature (0.3)
Parameters: {'temperature': 0.3, 'do_sample': True}
Input: AI is Transforming
Generated:  the world, and the world is changing.






--------------------------------------------------------------------------------
High Temperature (1.5)
Parameters: {'temperature': 1.5, 'do_sample': True}
Input: AI is Transforming
Generated:  the 'Reduce' as per a model


Categorized
--------------------------------------------------------------------------------
Top-k (5)
Parameters: {'top_k': 5, 'do_sample': True}
Input: AI is Transforming
Generated:  the world.››
The world is not the same as
--------------------------------------------------------------------------------
Top-p (0.9)
Parameters: {'top_p': 0.9, 'do_sample': True}
Input: AI is Transformin