# Playground for Losses. Cross Entropy and Perplexity
* Calculation steps for Cross Entropy
* Calculation steps for Perplexity

## Imports and config

In [107]:
import torch
import torch.nn as nn
import tiktoken

# Compact tensor printing: tensors with more than 10 elements are summarized,
# showing 3 items at each edge and 2 decimal places.
torch.set_printoptions(threshold=10, edgeitems=3, precision=2)
# Fixed seed for reproducible results.
# NOTE(review): the seed is set before the %run lines and before the model is built;
# if the executed notebooks consume random numbers, the model's initial weights
# depend on this ordering — confirm before reordering statements in this cell.
torch.manual_seed(42)

# See these files for details
# Executes the sibling notebooks in this kernel; the names used below
# (create_dataloader, GPTModel, generate_tokens) are expected to come from them.
%run "01. DataPreparation.ipynb"
%run "06. GPTModel.ipynb"

# Model hyperparameters passed to GPTModel below.
GPT_CONFIG_124M = {
    "vocab_size": 50257,   # Vocabulary size
    "context_length": 256, # Shortened context length (orig: 1024)
    "emb_dim": 768,        # Embedding dimension
    "n_heads": 12,         # Number of attention heads
    "n_layers": 12,        # Number of layers
    "drop_rate": 0.1,      # Dropout rate
    "qkv_bias": False      # Query-key-value bias
}

# From 01. DataPreparation
# No-op self-assignment; presumably kept only to make the origin of the name
# visible in this notebook (or to quiet static analysis) — confirm.
create_dataloader = create_dataloader

# From 06. GPTModel
model = GPTModel(GPT_CONFIG_124M) # From 06. GPTModel.ipynb
model.eval();  # Disable dropout during inference
# No-op self-assignment, same purpose as for create_dataloader above.
generate_tokens = generate_tokens # From 06. GPTModel.ipynb

## Creating Input and Target set for demonstration
* Generates test Batches Input + Targets

In [None]:
def get_test_batch(batch_size=2, context_length=6, stride=4, verbose=False,
                   text_path="00. Robins Small Text Sample.txt"):
    """Return the first (inputs, targets) batch built from a sample text file.

    The text is read from ``text_path``, handed to ``create_dataloader``
    (defined in "01. DataPreparation") together with the GPT-2 tokenizer name,
    and the first batch the loader yields is returned.

    Args:
        batch_size: Forwarded to ``create_dataloader``.
        context_length: Forwarded to ``create_dataloader``.
        stride: Forwarded to ``create_dataloader``.
        verbose: If True, print shapes, raw token ids and the decoded text of
            every row of the batch.
        text_path: UTF-8 text file to load. Defaults to the sample text that
            ships next to this notebook.

    Returns:
        Tuple ``(inputs, targets, tokenizer)`` where ``tokenizer`` is the
        tiktoken GPT-2 encoding used for decoding in the verbose output.
        NOTE(review): inputs/targets are expected to be integer tensors of
        shape (batch_size, context_length) — confirm against create_dataloader.

    Raises:
        ValueError: If the dataloader yields no batch at all (for example when
            the text is too short for the requested settings).
    """
    # Load sample text and create dataloader
    with open(text_path, "r", encoding="utf-8") as file:
        raw_text = file.read()

    tokenizer = tiktoken.get_encoding("gpt2")
    dataloader = create_dataloader(
        raw_text,
        tokenizer_model_name="gpt2",
        batch_size=batch_size,
        context_length=context_length,
        stride=stride
    )

    # Get one batch of input and target data. Use a sentinel default so an empty
    # loader produces a readable error instead of a bare StopIteration.
    batch = next(iter(dataloader), None)
    if batch is None:
        raise ValueError(
            f"Dataloader yielded no batches for {text_path!r} "
            f"(batch_size={batch_size}, context_length={context_length}, stride={stride})."
        )
    inputs, targets = batch

    if verbose:
        print("Input shape:", inputs.shape)
        print("Target shape:", targets.shape)
        print("\nInput tokens:\n", inputs)
        print("\nTarget tokens:\n", targets)
        print("\nDecoded inputs:")
        for i in range(inputs.shape[0]):
            print(f"  [{i}]: {tokenizer.decode(inputs[i].tolist())}")
        print("\nDecoded targets:")
        for i in range(targets.shape[0]):
            print(f"  [{i}]: {tokenizer.decode(targets[i].tolist())}")

    return inputs, targets, tokenizer

# Demo call: runs only when `__file__` is absent from the current namespace.
# NOTE(review): presumably this skips the demo when the notebook is executed via
# %run from another notebook — confirm.
if '__file__' not in dir():
    inputs, targets, tokenizer = get_test_batch(verbose=True)

Input shape: torch.Size([2, 8])
Target shape: torch.Size([2, 8])

Input tokens:
 tensor([[ 4504,   284,  3498,  ...,   611,  2147,   550],
        [  464,  1708,  2745,  ...,    13,   198, 23588]])

Target tokens:
 tensor([[  284,  3498,  5433,  ...,  2147,   550,  3022],
        [ 1708,  2745,   547,  ...,   198, 23588,   602]])

Decoded inputs:
  [0]:  returned to Lab 42 as if nothing had
  [1]: The following weeks were chaos.
Equ

Decoded targets:
  [0]:  to Lab 42 as if nothing had happened
  [1]:  following weeks were chaos.
Equations


## Untrained model logits output (single forward pass)
* Real output of the model are logits (unnormalized scores from output layer)
* After applying softmax we get a probability distribution over the whole vocabulary (values sum to 1)
* Argmax on the probabilities returns the most probable token_id (array index) for the next token
* Model calculates logits for all input positions in parallel (Transformer architecture), but only the last position's logits are used for generation

In [109]:
def analyze_predictions(model, inputs, targets, tokenizer, verbose=False):
    """Run a single forward pass and derive next-token predictions from it.

    Args:
        model: Callable that maps ``inputs`` to logits.
        inputs: Batch of input token ids, indexed as ``inputs[row]``.
        targets: Batch of target token ids; only read for the ✓/✗ comparison
            in the verbose report.
        tokenizer: Object with a ``decode(list_of_ids)`` method; only used when
            ``verbose`` is True.
        verbose: If True, print shapes, the probabilities of batch row 0, the
            predicted ids and a per-position prediction-vs-target listing.

    Returns:
        Tuple ``(logits, probas, predicted_ids)``: the raw model output, its
        softmax over the last axis, and the argmax of those probabilities over
        the last axis.
    """
    # Forward pass only; no autograd graph is recorded for it
    with torch.no_grad():
        logits = model(inputs)

    probas = logits.softmax(dim=-1)
    predicted_ids = probas.argmax(dim=-1)

    # Quiet mode: nothing to report
    if not verbose:
        return logits, probas, predicted_ids

    print("\nOutput shape (batch_size, num_tokens, vocab_size):", probas.shape)
    print("\nReturns logits for every partial sequence. Current end token as Q decodes next output\n"
          "In production only last logit is relevant. In training all can be used")
    print(f'\nProbability vector for next word after every of the {probas.shape[1]} input token (batch 0).\nIndex = Token_ID\n', probas[0])
    print("\nArgMax gives Index of vector = token_id with max value. Shape:", predicted_ids.shape)
    print("Token IDs (all batches):\n", predicted_ids)

    # Per row: growing input prefix => token predicted after it, marked against the target
    for i, predicted_tokens in enumerate(predicted_ids.tolist()):
        print(f"\nBatch [{i}]:")
        input_tokens = inputs[i].tolist()
        target_tokens = targets[i].tolist()

        for pos in range(len(input_tokens)):
            guess_id = predicted_tokens[pos]
            context = tokenizer.decode(input_tokens[:pos + 1])
            predicted = tokenizer.decode([guess_id])
            if guess_id == target_tokens[pos]:
                match = "✓"
            else:
                match = "✗"
            print(f" '{context}'=>'{predicted}'[{guess_id}] {match}")

    return logits, probas, predicted_ids

# Demo call on the batch created above: runs only when `__file__` is absent from
# the current namespace.
# NOTE(review): presumably this skips the demo when the notebook is executed via
# %run from another notebook — confirm.
if '__file__' not in dir():
    logits, probas, predicted_ids = analyze_predictions(model, inputs, targets, tokenizer, verbose=True)


Output shape (batch_size, num_tokens, vocab_size): torch.Size([2, 8, 50257])

Returns logits for every partial sequence. Current end token as Q decodes next output
In production only last logit is relevant. In training all can be used

Probability vector for next word after every of the 8 input token (batch 0).
Index = Token_ID
 tensor([[2.50e-05, 1.65e-05, 1.18e-05,  ..., 1.23e-05, 7.70e-06, 1.23e-05],
        [1.33e-05, 2.51e-05, 1.23e-05,  ..., 7.07e-06, 1.28e-05, 8.43e-05],
        [1.66e-05, 7.78e-06, 7.39e-06,  ..., 1.56e-05, 3.74e-06, 1.09e-05],
        ...,
        [1.07e-05, 4.19e-06, 1.29e-05,  ..., 2.23e-05, 9.73e-06, 2.86e-05],
        [1.68e-05, 1.12e-05, 1.23e-05,  ..., 7.41e-06, 1.30e-05, 1.87e-05],
        [2.25e-05, 1.19e-05, 5.59e-06,  ..., 1.88e-05, 7.56e-06, 1.84e-05]])

ArgMax gives Index of vector = token_id with max value. Shape: torch.Size([2, 8])
Token IDs (all batches):
 tensor([[37723, 34784, 30389,  ..., 49327, 30327, 30257],
        [43794, 24782, 16450,  

# Training Goal and Cross Entropy Loss
* Goal: Maximize probability of correct target tokens (bring `probas[target_id]` close to 1.0)
* Logarithms convert products to sums and prevent numerical underflow, making optimization easier
* Cross-Entropy Loss = negative average log probability of target tokens: `-mean(log(probas[target_id]))`
* Loss can be computed at different granularities:
    * Per-token: `-log(P(y_i))` for individual predictions (batch_size, context_length)
    * Per-sequence: Average over tokens in each sequence (batch_size,)
    * Batch-average (the common choice, implemented here): single scalar averaged over all predictions (N = batch_size × context_length)
* Formula: `Loss = -1/N * Σ log(P(y_i))`
    * `N` = total number of predictions being averaged over
    * `y_i` = the i-th target token (correct token at position i)
    * `P(y_i)` = model's predicted probability for target token y_i
    * `Σ` = sum over all N predictions
* Training typically uses batch-average loss for backpropagation
* Lower loss = model assigns higher probability to correct tokens

In [110]:
def extract_target_log_probas(probas, targets, tokenizer, verbose=False):
    """Compute the cross-entropy loss step by step from probabilities.

    Args:
        probas: Probabilities indexed as ``[row, position, token_id]``.
        targets: Target token ids, a 2-D tensor indexed as ``[row, position]``.
        tokenizer: Object with a ``decode(list_of_ids)`` method; only used when
            ``verbose`` is True.
        verbose: If True, print the intermediate tensors and a per-token
            breakdown of the target log-probabilities.

    Returns:
        Tuple ``(target_log_probas, avg_log_probas, cross_entropy_loss)``:
        the log-probability of every target token (same shape as ``targets``),
        their mean, and the negated mean.
    """
    # Log of every probability in the vocabulary distribution
    log_probas = probas.log()

    # Pick, for each (row, position), the entry whose vocabulary index is the target id.
    # The three index tensors broadcast to the shape of `targets`.
    batch_size, num_tokens = targets.shape
    row_index = torch.arange(batch_size)[:, None]
    position_index = torch.arange(num_tokens)
    target_log_probas = log_probas[row_index, position_index, targets]

    # Mean log-probability of the targets; its negation is the cross-entropy loss
    avg_log_probas = target_log_probas.mean()
    cross_entropy_loss = -avg_log_probas

    if not verbose:
        return target_log_probas, avg_log_probas, cross_entropy_loss

    print("Log of probabilities (batch 0):\n", log_probas[0])
    print("\nLog probabilities for target tokens only:")
    print("Shape:", target_log_probas.shape)
    print(target_log_probas)
    print(f"\nAverage log probability: {avg_log_probas.item():.4f}")
    print(f"Negative average log probability (loss): {cross_entropy_loss.item():.4f}")

    # Per-token breakdown, one section per batch row
    print("\nPer-token log probabilities for targets:")
    target_rows = targets.tolist()
    log_prob_rows = target_log_probas.tolist()
    for i in range(batch_size):
        print(f"\nBatch [{i}]:")
        for pos in range(num_tokens):
            target_token = target_rows[i][pos]
            log_prob = log_prob_rows[i][pos]
            target_text = tokenizer.decode([target_token])
            print(f"  Position {pos}: target '{target_text}' [{target_token}] -> log_prob: {log_prob:.4f}")

    return target_log_probas, avg_log_probas, cross_entropy_loss

# Demo call on the probabilities computed above: runs only when `__file__` is
# absent from the current namespace.
# NOTE(review): presumably this skips the demo when the notebook is executed via
# %run from another notebook — confirm.
if '__file__' not in dir():
    target_log_probas, avg_log_probas, cross_entropy_loss = extract_target_log_probas(probas, targets, tokenizer, verbose=True)

Log of probabilities (batch 0):
 tensor([[-10.60, -11.01, -11.35,  ..., -11.31, -11.77, -11.30],
        [-11.23, -10.59, -11.30,  ..., -11.86, -11.26,  -9.38],
        [-11.01, -11.76, -11.82,  ..., -11.07, -12.50, -11.43],
        ...,
        [-11.44, -12.38, -11.25,  ..., -10.71, -11.54, -10.46],
        [-10.99, -11.40, -11.31,  ..., -11.81, -11.25, -10.89],
        [-10.70, -11.34, -12.09,  ..., -10.88, -11.79, -10.90]])

Log probabilities for target tokens only:
Shape: torch.Size([2, 8])
tensor([[ -9.83, -12.11, -11.36,  ..., -11.29, -10.38, -11.14],
        [-10.17, -10.86, -10.67,  ...,  -9.97, -11.46, -11.59]])

Average log probability: -10.8851
Negative average log probability (loss): 10.8851

Per-token log probabilities for targets:

Batch [0]:
  Position 0: target ' to' [284] -> log_prob: -9.8300
  Position 1: target ' Lab' [3498] -> log_prob: -12.1084
  Position 2: target ' 42' [5433] -> log_prob: -11.3566
  Position 3: target ' as' [355] -> log_prob: -11.1432
  Position 

## Use Pytorch Calculation function
* Takes care of softmax, log-probability etc.
* Batch dimensions need to be flattened

In [111]:
# Cross-check the step-by-step loss against PyTorch's built-in cross entropy
if '__file__' not in dir():
    # Reference value from the manual implementation (quiet mode)
    _, _, manual_loss = extract_target_log_probas(probas, targets, tokenizer, verbose=False)

    # Built-in loss works on raw logits. Merge the batch and position axes first:
    # logits (batch, context, vocab) -> (batch*context, vocab)
    # targets (batch, context)       -> (batch*context,)
    logits_flat = logits.reshape(-1, logits.shape[-1])
    targets_flat = targets.reshape(-1)
    pytorch_loss = nn.CrossEntropyLoss()(logits_flat, targets_flat)

    # Both numbers are printed side by side for comparison
    print(f"Manual cross-entropy loss: {manual_loss.item():.4f}")
    print(f"PyTorch cross-entropy loss: {pytorch_loss.item():.4f}")

Manual cross-entropy loss: 10.8851
PyTorch cross-entropy loss: 10.8851


## Perplexity
* Simply exponential of cross-entropy loss
* Considered more interpretable
    * Can be understood as an estimate of the effective number of vocabulary tokens the model is uncertain between at each step
    * How well the probability predicted by model matches actual distribution in words of dataset
    * Lower perplexity -> better model predictions

In [112]:
if '__file__' not in dir():
    # Perplexity is the exponential of the cross-entropy loss; the vocabulary
    # size is printed next to it for comparison.
    perplexity = manual_loss.exp()
    print("Perplexity: ", perplexity)
    print("Vocabulary: ", GPT_CONFIG_124M["vocab_size"])

Perplexity:  tensor(53377.07)
Vocabulary:  50257
