# Playground for Losses. Cross Entropy and Perplexity
* Calculation steps for Cross Entropy
* Calculation steps for Perplexity

## Imports and config

In [22]:
import torch
import torch.nn as nn
import tiktoken

# Keep printed tensors short: tensors with more than 10 elements are summarized,
# showing 3 items at each edge of every dimension
torch.set_printoptions(threshold=10, edgeitems=3)
# Seed torch's global RNG so results are repeatable
# NOTE(review): the seed is set before the %run below; if that notebook consumes
# random numbers or re-seeds, the weights created further down depend on it — confirm
torch.manual_seed(42)

# See these files for details
# IPython magic (not plain Python). Per the comments below, GPTModel and
# generate_tokens are expected to be defined by this notebook.
%run "06. GPTModel.ipynb"

# Hyperparameters passed to GPTModel below; also read later for the context length
GPT_CONFIG_124M = {
    "vocab_size": 50257,   # Vocabulary size
    "context_length": 256, # Shortened context length (orig: 1024)
    "emb_dim": 768,        # Embedding dimension
    "n_heads": 12,         # Number of attention heads
    "n_layers": 12,        # Number of layers
    "drop_rate": 0.1,      # Dropout rate
    "qkv_bias": False      # Query-key-value bias
}

# Freshly constructed model; no weights are loaded in this cell
model = GPTModel(GPT_CONFIG_124M) # From 06. GPTModel.ipynb
model.eval();  # Disable dropout during inference

# Self-assignment: changes nothing at runtime, but raises NameError if the %run
# above did not define the name; serves as a pointer to where it comes from
generate_tokens = generate_tokens # From 06. GPTModel.ipynb

## Translation helper

In [23]:
def text_to_token_ids(text, tokenizer):
    """Encode a string into a batch of token IDs.

    Args:
        text: String to tokenize. The literal '<|endoftext|>' is allowed and
            encoded as a special token.
        tokenizer: Object exposing ``encode(text, allowed_special=...)`` that
            returns a list of integer token IDs (e.g. a tiktoken encoding).

    Returns:
        torch.Tensor of dtype torch.long with shape (1, num_tokens).
    """
    encoded = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
    # Pin the dtype: for empty text torch.tensor([]) would otherwise infer
    # float32, which is not a valid integer dtype for token IDs.
    encoded_tensor = torch.tensor(encoded, dtype=torch.long).unsqueeze(0) # add batch dimension
    return encoded_tensor

def token_ids_to_text(token_ids, tokenizer):
    """Decode a tensor of token IDs back into text.

    Args:
        token_ids: torch.Tensor of token IDs, either with a leading batch axis
            of size 1, i.e. shape (1, num_tokens), or already flat with shape
            (num_tokens,).
        tokenizer: Object exposing ``decode(list_of_ids)`` that returns a string.

    Returns:
        Whatever ``tokenizer.decode`` returns for the flat list of IDs.
    """
    # Only strip a batch axis when there is one. Calling squeeze(0) on a 1-D
    # tensor of length 1 would collapse it to a scalar, and tolist() would then
    # hand decode() a bare int instead of a list.
    flat = token_ids.squeeze(0) if token_ids.dim() > 1 else token_ids
    return tokenizer.decode(flat.tolist())

## Untrained output
* Generates up to 10 new tokens for each prompt in an input batch of 2 (the model is untrained, so the continuations are random)

In [24]:
if '__file__' not in dir():
    # "gpt2" encoding from tiktoken; later cells reuse this tokenizer
    tokenizer = tiktoken.get_encoding("gpt2")

    # Two prompts of 3 tokens each, every one encoded to a (1, 3) tensor
    text1 = text_to_token_ids("every effort moves", tokenizer)
    text2 = text_to_token_ids("I really like", tokenizer)

    # Join along the batch axis -> (2, 3)
    inputs = torch.cat((text1, text2))

    print("Input shape ( batch x token_ids ):", inputs.shape)
    print("Input tokens:\n", inputs)

    # Ask the model for up to 10 new tokens per prompt
    token_ids = generate_tokens(
        context_size=GPT_CONFIG_124M["context_length"],
        max_new_tokens=10,
        token_IDs=inputs,
        model=model,
    )

    print("\nOutput shape: ", token_ids.shape)

    # Turn each row of the batch back into text
    print("\nBatch outputs: ")
    for i in range(len(token_ids)):
        # Re-add a batch axis of size 1, the shape token_ids_to_text strips
        text = token_ids_to_text(token_ids[i].unsqueeze(0), tokenizer)
        print(f"  [{i}]: {text}")

Input shape ( batch x token_ids ): torch.Size([2, 3])
Input tokens:
 tensor([[16833,  3626,  6100],
        [   40,  1107,   588]])

Output shape:  torch.Size([2, 13])

Batch outputs: 
  [0]: every effort moves HO Receiver latent Mitt unable cycling denote Python clears formations
  [1]: I really like energ opacity Labor Fen!] lichAlien adequately: membership


## Real model logits output
* The model's raw output is logits (unnormalized scores from the output layer)
* Applying softmax turns them into a probability distribution over the whole vocabulary
* Argmax over the probabilities returns the most probable token_id (array index) for the next token

In [28]:
if '__file__' not in dir():
    # Forward pass without gradient tracking
    with torch.no_grad():
        logits = model(inputs)

    # Normalize the logits over the last (vocabulary) axis into probabilities
    probas = logits.softmax(dim=-1)
    print("\nOutput shape (batch_size, num_tokens, vocab_size): ", probas.shape)
    print(probas)

    # Index of the largest probability at every position
    token_ids = probas.argmax(dim=-1)
    print("\nArgMax -> Token IDs shape:", token_ids.shape)
    print("Token IDs:\n", token_ids)

    # For each batch item, pair every growing input prefix with its predicted token
    for i in range(len(token_ids)):
        print(f"\nBatch [{i}]:")
        input_tokens = inputs[i].tolist()
        predicted_tokens = token_ids[i].tolist()

        for pos in range(len(input_tokens)):
            predicted_id = predicted_tokens[pos]
            predicted = tokenizer.decode([predicted_id])
            # Input prefix ending at, and including, position pos
            context = tokenizer.decode(input_tokens[:pos + 1])
            print(f"  '{context}' predicts -> '{predicted}' [{predicted_id}]")


Output shape (batch_size, num_tokens, vocab_size):  torch.Size([2, 3, 50257])
tensor([[[6.7266e-05, 2.2964e-05, 6.6790e-06,  ..., 9.3710e-06,
          2.8003e-05, 1.3810e-05],
         [3.7238e-05, 1.5398e-05, 9.9722e-06,  ..., 6.6796e-06,
          1.2250e-05, 6.4735e-05],
         [1.9261e-05, 1.1219e-05, 2.4156e-05,  ..., 1.5356e-05,
          4.5120e-06, 3.8627e-05]],

        [[2.2608e-05, 2.3807e-05, 1.7500e-05,  ..., 2.0317e-05,
          2.4596e-05, 3.4503e-05],
         [1.3748e-05, 2.5611e-05, 9.0395e-06,  ..., 1.1674e-05,
          2.4658e-05, 7.0392e-05],
         [4.1235e-05, 6.7255e-06, 1.4985e-05,  ..., 7.6617e-06,
          7.1792e-06, 6.0584e-06]]])

ArgMax -> Token IDs shape: torch.Size([2, 3])
Token IDs:
 tensor([[24851,   406, 40115],
        [29716, 40825, 19647]])

Batch [0]:
  'every' predicts -> 'etti' [24851]
  'every effort' predicts -> ' L' [406]
  'every effort moves' predicts -> ' HO' [40115]

Batch [1]:
  'I' predicts -> 'ovsky' [29716]
  'I really' pred