In [16]:
import math
import time
import torch
from torch.utils.data import DataLoader, TensorDataset
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
import math
from tqdm.auto import tqdm

# --- Experiment configuration -------------------------------------------
# Hugging Face model id used for both the tokenizer and the causal LM.
MODEL_NAME = "gpt2"
# Dataset name / config passed to `datasets.load_dataset`.
DATASET_NAME = "wikitext"
DATASET_CONFIG = "wikitext-2-raw-v1"
# Use the GPU when one is available; otherwise fall back to the CPU so the
# notebook still runs end-to-end on machines without CUDA.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Length (in tokens) of each evaluation chunk fed to the model.
SEQ_LEN = 512
# Number of chunks evaluated per forward pass.
BATCH_SIZE = 16

In [17]:
def load_wikitext(tokenizer, ratio=0.5):
    """Tokenize the leading part of the dataset's test split as one sequence.

    The dataset identified by DATASET_NAME / DATASET_CONFIG is loaded, the
    first ``int(len(lines) * ratio)`` entries of its ``test`` split's ``text``
    column are joined with blank lines, and the result is tokenized in a
    single call.

    Args:
        tokenizer: Callable tokenizer; invoked as
            ``tokenizer(text, return_tensors='pt')`` and expected to return a
            mapping with an ``"input_ids"`` entry.
        ratio: Fraction of the test-split entries (counted from the start)
            to keep.

    Returns:
        The first row of the tokenizer's ``input_ids`` tensor, i.e. the token
        ids of the joined text.
    """
    test_lines = load_dataset(DATASET_NAME, DATASET_CONFIG)['test']['text']
    n_keep = int(len(test_lines) * ratio)
    joined = "\n\n".join(test_lines[:n_keep])

    encoded = tokenizer(joined, return_tensors='pt')
    # The tokenizer output is batched; take the single row.
    return encoded["input_ids"][0]

In [18]:
def make_chunks(ids, seq_len):
    """Cut a 1-D token tensor into non-overlapping rows of ``seq_len`` tokens.

    Trailing tokens that do not fill a complete row are dropped.

    Args:
        ids: 1-D tensor of token ids.
        seq_len: Number of tokens per row.

    Returns:
        ``(inputs, labels)``: ``inputs`` has shape
        ``(ids.size(0) // seq_len, seq_len)`` and ``labels`` is an independent
        copy of it.
    """
    full_rows, _ = divmod(ids.size(0), seq_len)
    # Keep only the prefix that divides evenly into rows.
    usable = ids[: full_rows * seq_len]
    inputs = usable.view(full_rows, seq_len)
    return inputs, inputs.clone()

In [19]:
def compute_ppl(model, inputs, labels, device, batch_size):
    """Compute token-level perplexity of ``model`` over pre-chunked sequences.

    Each batch is run through the model with ``labels`` supplied, and the
    returned mean loss is re-weighted by the number of target tokens that
    contributed to it, so the final value is a per-token average across all
    batches (including a smaller final batch).

    Assumes the model follows the Hugging Face causal-LM convention: labels
    are shifted by one position inside the model, positions equal to -100 are
    ignored, and ``out.loss`` is the mean over the remaining targets.

    Args:
        model: Causal LM called as ``model(input_ids=..., labels=...)`` whose
            output exposes a scalar ``.loss``.
        inputs: Integer tensor of shape ``(n_chunks, seq_len)``.
        labels: Tensor of the same shape as ``inputs``.
        device: Device the batches are moved to before the forward pass.
        batch_size: Number of chunks per forward pass.

    Returns:
        ``(ppl, elapsed)``: the perplexity and the wall-clock seconds spent in
        the evaluation loop.

    Raises:
        ValueError: If no target token was scored (e.g. empty ``inputs``).
    """
    # Label value skipped by the loss under the Hugging Face convention.
    ignore_index = -100

    ds = TensorDataset(inputs, labels)
    dl = DataLoader(ds, batch_size=batch_size)

    total_nll = 0.0
    total_tokens = 0
    t0 = time.perf_counter()

    model.eval()
    with torch.no_grad():
        for batch_inputs, batch_labels in tqdm(dl, total=len(dl), desc="batches", leave=False):
            # The first position of every sequence is never a prediction
            # target (labels are shifted by one), and ignored labels do not
            # contribute to the mean loss; count only the scored targets.
            n_tokens_batch = int((batch_labels[:, 1:] != ignore_index).sum().item())
            if n_tokens_batch == 0:
                # Nothing to score in this batch; its mean loss is undefined.
                continue

            batch_inputs = batch_inputs.to(device)
            batch_labels = batch_labels.to(device)

            out = model(input_ids=batch_inputs, labels=batch_labels)

            # Turn the batch mean back into a sum so batches of different
            # sizes are weighted correctly.
            total_nll += out.loss.item() * n_tokens_batch
            total_tokens += n_tokens_batch

    elapsed = time.perf_counter() - t0

    if total_tokens == 0:
        raise ValueError("compute_ppl: no target tokens to score (empty inputs or all labels ignored)")

    ppl = math.exp(total_nll / total_tokens)
    return ppl, elapsed

In [20]:
# Load the tokenizer and the model, then move the model to the target device.
print("Loading wikitext")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
model = model.to(DEVICE)

# Tokenize the whole test split (ratio=1.0) into a single sequence of ids.
print("Tokenizing dataset")
ids = load_wikitext(tokenizer, ratio=1.0)
print(f"Total tokens: {ids.size(0)}")

# Cut the id sequence into fixed-length evaluation chunks.
print("Creating chunks")
inputs, labels = make_chunks(ids, seq_len=SEQ_LEN)

# Evaluate and report perplexity together with the wall-clock time taken.
print("Computing perplexity")
ppl, elapsed = compute_ppl(
    model=model,
    inputs=inputs,
    labels=labels,
    device=DEVICE,
    batch_size=BATCH_SIZE,
)

print(f"Perplexity: {ppl:.2f}")
print(f"Elapsed time: {elapsed:.2f} seconds")

Loading wikitext
Tokenizing dataset


Token indices sequence length is longer than the specified maximum sequence length for this model (287644 > 1024). Running this sequence through the model will result in indexing errors


Total tokens: 287644
Creating chunks
Computing perplexity


batches:   0%|          | 0/36 [00:00<?, ?it/s]

Perplexity: 35.48
Elapsed time: 5.08 seconds


In [21]:
def compute_ar_ppl(model, inputs, labels, device, batch_size):
    """Set up state for a perplexity evaluation that also records latencies.

    NOTE(review): this function is unfinished. It only builds the batch
    loader and initialises its accumulators; nothing is evaluated and it
    implicitly returns ``None``.

    Args:
        model: Currently unused.
        inputs: Tensor of input chunks; paired row-wise with ``labels``.
        labels: Tensor of label chunks with the same first dimension.
        device: Currently unused.
        batch_size: Number of chunks per batch produced by the loader.
    """
    dl = DataLoader(TensorDataset(inputs, labels), batch_size=batch_size)
    # Running totals and the per-step latency log, not yet filled in.
    total_nll, total_tokens = 0.0, 0
    latencies = []
    

In [22]:
# Display the loaded model's configuration (rendered as this cell's output),
# e.g. to check `n_positions` against the SEQ_LEN used for chunking.
model.config

GPT2Config {
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "dtype": "float32",
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_version": "4.57.1",
  "use_cache": true,
  "vocab_size": 50257
}