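# Alright, you've successfully built the GPT model. Now, to assess its performance, we'll use perplexity as the evaluation metric.

Perplexity is the exponential of the average negative log-likelihood per token, $\text{PPL}(X) = \exp\left(-\frac{1}{t}\sum_{i=1}^{t}\log p_\theta(x_i \mid x_{<i})\right)$; a lower value means the model predicts the text better.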

In [1]:
! pip install transformers datasets

Collecting datasets
  Downloading datasets-2.19.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=12.0.0 (from datasets)
  Downloading pyarrow-16.0.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.0 kB)
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl.metadata (3.6 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py312-none-any.whl.metadata (7.2 kB)
Downloading datasets-2.19.0-py3-none-any.whl (542 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0mm
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB

In [2]:
from transformers import GPT2LMHeadModel, GPT2TokenizerFast
import torch

## download GPT model
# Prefer the GPU when one is available; otherwise fall back to the CPU so the
# notebook still runs (more slowly) on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
model_id = "gpt2"
# Pretrained GPT-2 weights, moved to the selected device.
model = GPT2LMHeadModel.from_pretrained(model_id).to(device)
# Matching tokenizer for the same checkpoint.
tokenizer = GPT2TokenizerFast.from_pretrained(model_id)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from datasets import load_dataset

# Fetch the raw WikiText-2 test split and peek at one entry of its "text" column.
test = load_dataset(path="wikitext", name="wikitext-2-raw-v1", split="test")
sample_text = test["text"][3]
print(sample_text)

Downloading readme: 100%|██████████| 10.5k/10.5k [00:00<00:00, 34.7MB/s]
Downloading data: 100%|██████████| 733k/733k [00:00<00:00, 3.14MB/s]
Downloading data: 100%|██████████| 6.36M/6.36M [00:00<00:00, 23.8MB/s]
Downloading data: 100%|██████████| 657k/657k [00:00<00:00, 2.65MB/s]
Generating test split: 100%|██████████| 4358/4358 [00:00<00:00, 297937.72 examples/s]
Generating train split: 100%|██████████| 36718/36718 [00:00<00:00, 748902.00 examples/s]
Generating validation split: 100%|██████████| 3760/3760 [00:00<00:00, 449560.52 examples/s]

 Robert Boulter is an English film , television and theatre actor . He had a guest @-@ starring role on the television series The Bill in 2000 . This was followed by a starring role in the play Herons written by Simon Stephens , which was performed in 2001 at the Royal Court Theatre . He had a guest role in the television series Judge John Deed in 2002 . In 2004 Boulter landed a role as " Craig " in the episode " Teddy 's Story " of the television series The Long Firm ; he starred alongside actors Mark Strong and Derek Jacobi . He was cast in the 2005 theatre productions of the Philip Ridley play Mercury Fur , which was performed at the Drum Theatre in Plymouth and the Menier Chocolate Factory in London . He was directed by John Tiffany and starred alongside Ben Whishaw , Shane Zaza , Harry Kent , Fraser Ayres , Sophie Stanton and Dominic Hall . 






In [4]:
# Number of examples in the test split.
num_test_examples = len(test)
print(num_test_examples)

4358


In [5]:
# The dataset is small and only one forward pass over it is needed, so the whole
# test split is joined into a single string and tokenized in memory at once.
full_text = "\n\n".join(test["text"])
encodings = tokenizer(full_text, return_tensors="pt")

Token indices sequence length is longer than the specified maximum sequence length for this model (287644 > 1024). Running this sequence through the model will result in indexing errors


In [6]:
import torch
from tqdm import tqdm

# Sliding-window perplexity: the corpus is scored in overlapping windows of up
# to `max_length` tokens that advance by `stride` tokens, so every scored token
# (after the first window) has prior context.
max_length = model.config.n_positions  # context window of the model (1024 for GPT-2)
stride = 512  # step between window starts; each window still spans up to max_length tokens
seq_len = encodings.input_ids.size(1)  # total number of tokens in the encoded text

nlls = []  # per-window *summed* negative log-likelihoods
n_tokens = 0  # number of tokens that actually contributed to the loss
prev_end_loc = 0
for begin_loc in tqdm(range(0, seq_len, stride)):
    end_loc = min(begin_loc + max_length, seq_len)
    trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
    input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)
    target_ids = input_ids.clone()
    # Tokens already scored by an earlier window act only as context here.
    target_ids[:, :-trg_len] = -100

    # The model shifts the labels one position to the left internally, so the
    # label at index 0 is never scored. outputs.loss is the mean over the
    # remaining non-ignored labels; count exactly those to turn the mean back
    # into a sum (trg_len would over-count by one in the first window).
    num_loss_tokens = int((target_ids[:, 1:] != -100).sum())

    if num_loss_tokens > 0:  # a window with nothing to score would yield a NaN loss
        with torch.no_grad():
            outputs = model(input_ids, labels=target_ids)
        nlls.append(outputs.loss * num_loss_tokens)
        n_tokens += num_loss_tokens

    prev_end_loc = end_loc
    if end_loc == seq_len:
        break

if n_tokens == 0:
    raise ValueError("At least two tokens are required to compute perplexity.")

# Perplexity = exp(total negative log-likelihood / number of scored tokens).
ppl = torch.exp(torch.stack(nlls).sum() / n_tokens)


100%|█████████▉| 560/562 [00:23<00:00, 23.71it/s]


In [7]:
# Extract the perplexity as a plain Python float before reporting it.
ppl_value = ppl.item()
print('The perplexity of GPT-2 on WikiText-2 is:', ppl_value)

The perplexity of GPT-2 on WikiText-2 is: 25.170446395874023


In [8]:
# Print the perplexity tensor itself rather than the float extracted with .item().
print(ppl)

tensor(25.1704, device='cuda:0')
