In [1]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch
import math
import os


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Directory handed to `from_pretrained` as `cache_dir` for both the tokenizer
# and the model, so the two share one on-disk cache.
cache_dir_location = "models/gpt2"

tokenizer = GPT2Tokenizer.from_pretrained(
    "distilgpt2",
    cache_dir=cache_dir_location,
)

# `eval()` returns the module itself, so it can be chained onto the load call;
# this is an inference-only notebook and the model is never trained here.
model = GPT2LMHeadModel.from_pretrained(
    "distilgpt2",
    cache_dir=cache_dir_location,
).eval()

# Use the GPU when one is present; otherwise the model stays where it was loaded.
if torch.cuda.is_available():
    model.to("cuda")

In [3]:
def calculate_perplexity(text, model, tokenizer, device=None, max_length=1024, stride=512):
    """Compute the sliding-window perplexity of `text` under a causal LM.

    The text is tokenized without truncation and scored in windows that end
    every `stride` tokens. Each window is fed together with up to
    `max_length - stride` tokens of preceding context; context positions are
    labelled -100 so that they do not contribute to the loss.

    Args:
        text: The string to score.
        model: Callable as `model(input_ids, labels=...)` returning an object
            whose `.loss` is the mean negative log-likelihood over the scored
            tokens (labels are expected to be shifted inside the model, as the
            Hugging Face causal-LM heads do).
        tokenizer: Callable as `tokenizer(text, return_tensors="pt",
            truncation=False)` returning a mapping with an `"input_ids"`
            tensor of shape (1, sequence_length).
        device: Device the input tensors are moved to. When None (default),
            the device of the model's first parameter is used, falling back
            to "cpu" if the model exposes no parameters.
        max_length: Maximum number of tokens (context + targets) per window.
        stride: Number of new tokens scored per window.

    Returns:
        The perplexity as a Python float: exp(total NLL / number of scored
        tokens).

    Raises:
        ValueError: If `stride`/`max_length` are not positive, if
            `stride > max_length` (tokens would be skipped), or if the text
            yields no token that can be scored (fewer than two tokens).
    """
    if stride <= 0 or max_length <= 0:
        raise ValueError("stride and max_length must be positive")
    if stride > max_length:
        raise ValueError("stride must not exceed max_length, or tokens would be skipped")

    if device is None:
        # Follow the model instead of assuming CUDA: the inputs must live on
        # the same device as the weights.
        try:
            device = next(model.parameters()).device
        except (AttributeError, StopIteration):
            device = "cpu"

    inputs = tokenizer(text, return_tensors="pt", truncation=False)
    input_ids = inputs["input_ids"][0]
    seq_len = len(input_ids)
    if seq_len < 2:
        # The first token has no left context, so at least two tokens are
        # needed for a single prediction.
        raise ValueError("text must contain at least two tokens to compute perplexity")

    nlls = []
    scored_tokens = 0
    for i in range(0, seq_len, stride):
        begin_loc = max(i + stride - max_length, 0)
        end_loc = min(i + stride, seq_len)
        trg_len = end_loc - i  # tokens that are new in this window

        input_ids_chunk = input_ids[begin_loc:end_loc].unsqueeze(0).to(device)
        target_ids = input_ids_chunk.clone()
        target_ids[:, :-trg_len] = -100  # Only predict the last trg_len tokens

        # The model shifts labels by one position, so position 0 of the chunk
        # is never a prediction target. Count what is actually averaged into
        # `loss` so each window is weighted by its true number of scored tokens.
        num_loss_tokens = int((target_ids[:, 1:] != -100).sum().item())
        if num_loss_tokens == 0:
            continue  # nothing to score here; the mean loss would be NaN

        with torch.no_grad():
            outputs = model(input_ids_chunk, labels=target_ids)

        nlls.append(outputs.loss.float() * num_loss_tokens)
        scored_tokens += num_loss_tokens

    if scored_tokens == 0:
        raise ValueError("no tokens could be scored with the given stride/max_length")

    ppl = torch.exp(torch.stack(nlls).sum() / scored_tokens)
    return ppl.item()

In [4]:
# Root folder that is searched recursively for ".txt" transcripts.
transcript_folder = "amicorpus"

# Score on whatever device the model's weights actually live on, so the loop
# also works on machines without CUDA.
model_device = next(model.parameters()).device

for root, dirs, files in os.walk(transcript_folder):
    # os.walk yields entries in filesystem order; sorting in place (dirs) and
    # per directory (files) makes the traversal and printed report reproducible.
    dirs.sort()
    for file in sorted(files):
        if not file.endswith(".txt"):
            continue

        txt_path = os.path.join(root, file)
        with open(txt_path, "r", encoding="utf-8") as f:
            text = f.read()

        try:
            ppl = calculate_perplexity(text, model, tokenizer, device=model_device)
            print(f"{file}: Perplexity = {ppl:.2f}")
        except Exception as e:
            # Best-effort batch run: report the failing transcript and move on.
            print(f"Failed to process {file}: {e}")


Token indices sequence length is longer than the specified maximum sequence length for this model (2673 > 1024). Running this sequence through the model will result in indexing errors
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


ES2002a.Mix-Headset.txt: Perplexity = 34.98
ES2002b.Mix-Headset.txt: Perplexity = 34.42
ES2002c.Mix-Headset.txt: Perplexity = 33.80
ES2002d.Mix-Headset.txt: Perplexity = 35.58
ES2003a.Mix-Headset.txt: Perplexity = 45.75
ES2003b.Mix-Headset.txt: Perplexity = 43.60
ES2003c.Mix-Headset.txt: Perplexity = 40.78
ES2003d.Mix-Headset.txt: Perplexity = 47.92
ES2004a.Mix-Headset.txt: Perplexity = 30.34
ES2004b.Mix-Headset.txt: Perplexity = 44.28
ES2004c.Mix-Headset.txt: Perplexity = 45.27
ES2004d.Mix-Headset.txt: Perplexity = 39.20
ES2005a.Mix-Headset.txt: Perplexity = 75.52
ES2005b.Mix-Headset.txt: Perplexity = 53.19
ES2005c.Mix-Headset.txt: Perplexity = 48.65
ES2005d.Mix-Headset.txt: Perplexity = 58.87
ES2006a.Mix-Headset.txt: Perplexity = 30.55
ES2006b.Mix-Headset.txt: Perplexity = 36.73
ES2006c.Mix-Headset.txt: Perplexity = 46.31
ES2006d.Mix-Headset.txt: Perplexity = 43.05
ES2007a.Mix-Headset.txt: Perplexity = 44.99
ES2007b.Mix-Headset.txt: Perplexity = 41.95
ES2007c.Mix-Headset.txt: Perplex