# Playground for Losses. Cross Entropy and Perplexity
* Calculation steps for Cross Entropy
* Calculation steps for Perplexity

## Imports and config

In [106]:
import torch
import torch.nn as nn
import tiktoken

# Compact tensor printing: tensors with more than 10 elements are summarized,
# showing 3 items at each edge and 2 digits of precision.
torch.set_printoptions(threshold=10, edgeitems=3, precision=2)
# Seed PyTorch's global RNG so repeated runs start from the same random state.
# NOTE(review): the seed is set before the %run lines below; if those notebooks
# draw random numbers or reseed, the model initialization further down depends
# on what they do — confirm.
torch.manual_seed(42)

# Execute the sibling notebooks in this kernel so the names they define become
# available here (used below: create_dataloader, GPTModel, generate_tokens).
# See these files for details
%run "01. DataPreparation.ipynb"
%run "06. GPTModel.ipynb"

# Hyperparameters passed to GPTModel below.
GPT_CONFIG_124M = {
    "vocab_size": 50257,   # Vocabulary size
    "context_length": 256, # Shortened context length (orig: 1024)
    "emb_dim": 768,        # Embedding dimension
    "n_heads": 12,         # Number of attention heads
    "n_layers": 12,        # Number of layers
    "drop_rate": 0.1,      # Dropout rate
    "qkv_bias": False      # Query-key-value bias
}

# From 01. DataPreparation
# Self-assignment: rebinds the name to itself, so it changes nothing when the
# name exists and raises NameError if the %run above did not define it.
# Presumably kept to make the origin of the name explicit — confirm.
create_dataloader = create_dataloader

# From 06. GPTModel
model = GPTModel(GPT_CONFIG_124M) # From 06. GPTModel.ipynb
model.eval();  # Disable dropout during inference
# Same self-assignment pattern as create_dataloader above.
generate_tokens = generate_tokens # From 06. GPTModel.ipynb

## Creating Input and Target Generation
* Generates test Batches Input + Targets

In [107]:
def get_test_batch(batch_size=2, context_length=4, stride=4, verbose=False,
                   text_path="00. Robins Small Text Sample.txt"):
    """Read a text file and return the first (inputs, targets) batch built from it.

    Args:
        batch_size: Forwarded unchanged to ``create_dataloader``.
        context_length: Forwarded unchanged to ``create_dataloader``.
        stride: Forwarded unchanged to ``create_dataloader``.
        verbose: If True, print shapes, raw token ids and the decoded text of
            every row in the batch.
        text_path: Path of the UTF-8 text file to tokenize. Defaults to the
            sample text shipped next to this notebook.

    Returns:
        Tuple ``(inputs, targets, tokenizer)``: the two tensors of the first
        batch yielded by the dataloader and the tiktoken encoding used to
        decode them.

    Raises:
        OSError: If ``text_path`` cannot be opened.
        ValueError: If the dataloader yields no batch at all (for example
            because the text is too short for the requested settings).
    """
    # Single source of truth for the encoding name, so the tokenizer used for
    # decoding here cannot drift from the one the dataloader is asked to use.
    tokenizer_model_name = "gpt2"

    # Load sample text and create dataloader
    with open(text_path, "r", encoding="utf-8") as file:
        raw_text = file.read()

    tokenizer = tiktoken.get_encoding(tokenizer_model_name)
    dataloader = create_dataloader(
        raw_text,
        tokenizer_model_name=tokenizer_model_name,
        batch_size=batch_size,
        context_length=context_length,
        stride=stride,
    )

    # Get one batch of input and target data; an empty dataloader would
    # otherwise surface as a bare StopIteration with no explanation.
    first_batch = next(iter(dataloader), None)
    if first_batch is None:
        raise ValueError(
            f"No batch could be created from {text_path!r} with "
            f"batch_size={batch_size}, context_length={context_length}, stride={stride}"
        )
    inputs, targets = first_batch

    if verbose:
        print("Input shape:", inputs.shape)
        print("Target shape:", targets.shape)
        print("\nInput tokens:\n", inputs)
        print("\nTarget tokens:\n", targets)
        # Decode each row back to text so inputs and targets can be compared by eye
        for label, token_batch in (("inputs", inputs), ("targets", targets)):
            print(f"\nDecoded {label}:")
            for i, row in enumerate(token_batch.tolist()):
                print(f"  [{i}]: {tokenizer.decode(row)}")

    return inputs, targets, tokenizer

# Demo call: runs only when the name `__file__` is absent from the current
# namespace (presumably so it is skipped when this notebook is pulled in from
# another notebook via %run — confirm). Binds the notebook-level names
# `inputs`, `targets` and `tokenizer` that later cells read.
if '__file__' not in dir():
    inputs, targets, tokenizer = get_test_batch(verbose=True)

Input shape: torch.Size([2, 4])
Target shape: torch.Size([2, 4])

Input tokens:
 tensor([[12925,  2250,   351,  1583],
        [ 3947,   284,  1950,   257]])

Target tokens:
 tensor([[2250,  351, 1583,   13],
        [ 284, 1950,  257,  649]])

Decoded inputs:
  [0]:  countless hours with Dr
  [1]:  seemed to suggest a

Decoded targets:
  [0]:  hours with Dr.
  [1]:  to suggest a new


## Untrained model logits output (single forward pass)
* The model's raw outputs are logits (unnormalized scores from the output layer)
* Applying softmax turns them into a probability distribution over the whole vocabulary (values sum to 1)
* Argmax over the probabilities returns the most probable token_id (array index) for the next token
* The model calculates logits for all input positions in parallel (Transformer architecture), but only the last position's logits are used for generation

In [108]:
def analyze_predictions(model, inputs, targets, tokenizer, verbose=False):
    """Run one forward pass and derive next-token probabilities and greedy picks.

    Args:
        model: Callable that maps `inputs` to logits; invoked once with
            gradient tracking disabled.
        inputs: Token-id tensor handed to the model, indexed as [batch, position].
        targets: Token ids laid out like `inputs`; only read when `verbose`.
        tokenizer: Object exposing `decode(list_of_ids)`; only used when `verbose`.
        verbose: If True, print shapes, probabilities and a per-position
            comparison of predicted vs. target token.

    Returns:
        Tuple `(logits, probas, predicted_ids)`: the model output, its softmax
        over the last axis, and the argmax of that softmax over the last axis.
    """
    # Forward pass without autograd bookkeeping
    with torch.no_grad():
        logits = model(inputs)

    # Normalize over the last axis, then pick the index with the highest probability
    probas = logits.softmax(dim=-1)
    predicted_ids = probas.argmax(dim=-1)

    # Quiet mode: nothing to report, hand back the tensors right away
    if not verbose:
        return logits, probas, predicted_ids

    print("\nOutput shape (batch_size, num_tokens, vocab_size):", probas.shape)
    print("\nReturns logits for every partial sequence. Current end token as Q decodes next output\n"
          "In production only last logit is relevant. In training all can be used")
    print(f'\nProbability vector for next word after every of the {probas.shape[1]} input token (batch 0).\nIndex = Token_ID\n', probas[0])
    print("\nArgMax gives Index of vector = token_id with max value. Shape:", predicted_ids.shape)
    print("Token IDs (all batches):\n", predicted_ids)

    # One report line per position: growing context, model's pick, expected token
    for batch_idx in range(predicted_ids.shape[0]):
        print(f"\nBatch [{batch_idx}]:")
        context_ids = inputs[batch_idx].tolist()
        guess_ids = predicted_ids[batch_idx].tolist()
        truth_ids = targets[batch_idx].tolist()

        for pos in range(len(context_ids)):
            context = tokenizer.decode(context_ids[:pos + 1])
            predicted = tokenizer.decode([guess_ids[pos]])
            target = tokenizer.decode([truth_ids[pos]])
            guess_id, truth_id = guess_ids[pos], truth_ids[pos]
            match = "✗" if guess_id != truth_id else "✓"
            print(f"  '{context}' --> '{predicted}' [{guess_id}] | target '{target}' [{truth_id}] {match}")

    return logits, probas, predicted_ids

# Demo call: runs only when the name `__file__` is absent from the current
# namespace (presumably so it is skipped when this notebook is pulled in from
# another notebook via %run — confirm). Binds the notebook-level names
# `logits`, `probas` and `predicted_ids` that later cells read.
if '__file__' not in dir():
    logits, probas, predicted_ids = analyze_predictions(model, inputs, targets, tokenizer, verbose=True)


Output shape (batch_size, num_tokens, vocab_size): torch.Size([2, 4, 50257])

Returns logits for every partial sequence. Current end token as Q decodes next output
In production only last logit is relevant. In training all can be used

Probability vector for next word after every of the 4 input token (batch 0).
Index = Token_ID
 tensor([[1.92e-05, 4.03e-05, 1.61e-05,  ..., 2.57e-05, 2.05e-05, 1.55e-05],
        [1.62e-05, 1.89e-05, 1.14e-05,  ..., 6.62e-06, 1.93e-05, 6.95e-05],
        [1.81e-05, 2.26e-05, 1.77e-05,  ..., 1.26e-05, 9.82e-06, 1.43e-05],
        [1.85e-05, 4.70e-05, 1.46e-05,  ..., 1.16e-05, 1.28e-05, 1.95e-05]])

ArgMax gives Index of vector = token_id with max value. Shape: torch.Size([2, 4])
Token IDs (all batches):
 tensor([[28155, 31548, 18617, 10345],
        [21391, 24756, 43525, 26721]])

Batch [0]:
  ' countless' --> 'witz' [28155] | target ' hours' [2250] ✗
  ' countless hours' --> ' dummy' [31548] | target ' with' [351] ✗
  ' countless hours with' --> ' launc

# Training Goal and Cross Entropy Loss
* Goal: Maximize probability of correct target tokens (bring `probas[target_id]` close to 1.0)
* Logarithms convert products to sums and prevent numerical underflow, making optimization easier
* Cross-Entropy Loss = negative average log probability of target tokens: `-mean(log(probas[target_id]))`
* Loss can be computed at different granularities:
    * Per-token: `-log(P(y_i))` for individual predictions (batch_size, context_length)
    * Per-sequence: Average over tokens in each sequence (batch_size,)
    * Batch-average (common; implemented here): Single scalar averaged over all predictions (N = batch_size × context_length)
* Formula: `Loss = -1/N * Σ log(P(y_i))`
    * `N` = total number of predictions being averaged over
    * `y_i` = the i-th target token (correct token at position i)
    * `P(y_i)` = model's predicted probability for target token y_i
    * `Σ` = sum over all N predictions
* Training typically uses batch-average loss for backpropagation
* Lower loss = model assigns higher probability to correct tokens

In [109]:
def extract_target_log_probas(probas, targets, tokenizer, verbose=False):
    """Compute the cross-entropy loss by hand from a probability tensor.

    Args:
        probas: Probability tensor indexed as [batch, position, token_id].
        targets: 2-D tensor of target token ids, indexed as [batch, position].
        tokenizer: Object exposing `decode(list_of_ids)`; only used when `verbose`.
        verbose: If True, print the intermediate tensors and a per-token breakdown.

    Returns:
        Tuple `(target_log_probas, avg_log_probas, cross_entropy_loss)`:
        the log-probability of every target token (same shape as `targets`),
        their mean as a scalar tensor, and the negated mean.
    """
    # Element-wise natural log of every probability
    log_probas = probas.log()

    # Pick, for each (batch, position), the entry at the target token id.
    # row_idx is (batch, 1) and col_idx is (position,), so together with
    # `targets` they broadcast to one lookup per target.
    batch_size, num_tokens = targets.shape
    row_idx = torch.arange(batch_size).unsqueeze(1)
    col_idx = torch.arange(num_tokens)
    target_log_probas = log_probas[row_idx, col_idx, targets]

    # Mean over all targets, and its negation (the cross-entropy loss)
    avg_log_probas = target_log_probas.mean()
    cross_entropy_loss = -avg_log_probas

    if verbose:
        print("Log of probabilities (batch 0):\n", log_probas[0])
        print("\nLog probabilities for target tokens only:")
        print("Shape:", target_log_probas.shape)
        print(target_log_probas)
        print(f"\nAverage log probability: {avg_log_probas.item():.4f}")
        print(f"Negative average log probability (loss): {cross_entropy_loss.item():.4f}")

        # Per-token breakdown, walking targets and their log-probs side by side
        print("\nPer-token log probabilities for targets:")
        paired_rows = zip(targets.tolist(), target_log_probas.tolist())
        for i, (target_row, log_prob_row) in enumerate(paired_rows):
            print(f"\nBatch [{i}]:")
            for pos, (target_token, log_prob) in enumerate(zip(target_row, log_prob_row)):
                target_text = tokenizer.decode([target_token])
                print(f"  Position {pos}: target '{target_text}' [{target_token}] -> log_prob: {log_prob:.4f}")

    return target_log_probas, avg_log_probas, cross_entropy_loss

# Demo call: runs only when the name `__file__` is absent from the current
# namespace (presumably so it is skipped when this notebook is pulled in from
# another notebook via %run — confirm). Binds the notebook-level names
# `target_log_probas`, `avg_log_probas` and `cross_entropy_loss`.
if '__file__' not in dir():
    target_log_probas, avg_log_probas, cross_entropy_loss = extract_target_log_probas(probas, targets, tokenizer, verbose=True)

Log of probabilities (batch 0):
 tensor([[-10.86, -10.12, -11.03,  ..., -10.57, -10.80, -11.08],
        [-11.03, -10.87, -11.38,  ..., -11.93, -10.86,  -9.57],
        [-10.92, -10.70, -10.94,  ..., -11.28, -11.53, -11.16],
        [-10.90,  -9.97, -11.14,  ..., -11.37, -11.27, -10.84]])

Log probabilities for target tokens only:
Shape: torch.Size([2, 4])
tensor([[-11.64, -11.54, -11.44, -10.46],
        [-10.09, -10.37, -11.06, -10.79]])

Average log probability: -10.9250
Negative average log probability (loss): 10.9250

Per-token log probabilities for targets:

Batch [0]:
  Position 0: target ' hours' [2250] -> log_prob: -11.6440
  Position 1: target ' with' [351] -> log_prob: -11.5365
  Position 2: target ' Dr' [1583] -> log_prob: -11.4386
  Position 3: target '.' [13] -> log_prob: -10.4596

Batch [1]:
  Position 0: target ' to' [284] -> log_prob: -10.0913
  Position 1: target ' suggest' [1950] -> log_prob: -10.3741
  Position 2: target ' a' [257] -> log_prob: -11.0634
  Position 3

## Use PyTorch Calculation function
* Takes care of softmax, log-probability etc.
* Batch dimensions need to be flattened

In [110]:
# Cross-check the manual loss against PyTorch's built-in calculation function
if '__file__' not in dir():
    # Manual calculation: the loss is the third return value. Indexing instead
    # of unpacking into `_` avoids rebinding `_`, which IPython uses for the
    # most recent cell output.
    manual_loss = extract_target_log_probas(probas, targets, tokenizer, verbose=False)[2]

    # PyTorch CrossEntropyLoss takes the raw logits (it handles softmax and
    # log-probability itself) with one row per prediction.
    # Flatten: (batch, context, vocab) -> (batch*context, vocab) and (batch, context) -> (batch*context)
    logits_flat = logits.flatten(0, 1)  # Flatten first 2 dims (start, end)
    targets_flat = targets.flatten()     # Flatten all dims
    pytorch_loss = nn.CrossEntropyLoss()(logits_flat, targets_flat)

    print(f"Manual cross-entropy loss: {manual_loss.item():.4f}")
    print(f"PyTorch cross-entropy loss: {pytorch_loss.item():.4f}")

    # Do not rely on eyeballing the two printed numbers: fail loudly if the two
    # computations ever diverge beyond the printed precision.
    assert torch.allclose(manual_loss, pytorch_loss, atol=1e-4), \
        "Manual and PyTorch cross-entropy losses disagree"

Manual cross-entropy loss: 10.9250
PyTorch cross-entropy loss: 10.9250


## Perplexity
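* Simply the exponential of the cross-entropy loss: `perplexity = exp(loss)`
* Considered more interpretable
    * Can be understood as an estimate of the effective number of vocabulary tokens the model is undecided between at each step
    * Measures how well the probability distribution predicted by the model matches the actual distribution of words in the dataset
    * Lower perplexity -> better model predictions
    * Reference point: uniform guessing over the whole vocabulary gives a perplexity equal to the vocabulary size, so an untrained model is expected to land near that value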

In [111]:
# Perplexity is exp(cross-entropy loss); the vocabulary size is printed next
# to it as a scale for comparison.
if '__file__' not in dir():
    perplexity = manual_loss.exp()
    print("Perplexity: ", perplexity)
    print("Vocabulary: ", GPT_CONFIG_124M["vocab_size"])

Perplexity:  tensor(55550.56)
Vocabulary:  50257
