In [2]:
import math
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the tokenizer and model
model_name = "meta-llama/Llama-3.1-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto")
model.eval()

def compute_perplexity(text: str) -> float:
    # Encode text
    inputs = tokenizer(text, return_tensors="pt")
    input_ids = inputs["input_ids"].to(model.device)

    # We will compute the loss over all tokens except the very first one, 
    # since the first token has no previous context
    with torch.no_grad():
        # Get model outputs
        outputs = model(input_ids, labels=input_ids)
        # outputs.loss is the mean cross-entropy loss over all tokens in the sequence
        # Multiplying by sequence length gives us the total negative log-likelihood
        neg_log_likelihood = outputs.loss * (input_ids.size(1))
    
    # Average negative log-likelihood per token
    avg_neg_log_likelihood = neg_log_likelihood.item() / input_ids.size(1)

    # Perplexity is exp of the average negative log-likelihood
    perplexity = math.exp(avg_neg_log_likelihood)
    return perplexity

# Example usage
sentence = "The quick brown fox jumps over the lazy dog."
ppl = compute_perplexity(sentence)
print(f"Perplexity: {ppl}")

ImportError: Using `low_cpu_mem_usage=True` or a `device_map` requires Accelerate: `pip install 'accelerate>=0.26.0'`