In [1]:
import torch
from pprint import pprint
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import os

from utils import *

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\tianq\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [2]:
# Environment check: report whether PyTorch can use a CUDA device and the value
# of torch.version.cuda for this installation.
print("PyTorch CUDA available:", torch.cuda.is_available())
print("CUDA version:", torch.version.cuda)

PyTorch CUDA available: True
CUDA version: 12.6


In [3]:
# Load the pretrained tokenizer and the language-model-head variant of GPT-2,
# both identified by the same checkpoint name.
model_name = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

In [4]:
# Put the model in evaluation mode; as the last expression of the cell, the
# returned module is also displayed, which shows the architecture.
model.eval()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [5]:
# Sanity check: run one sentence through the model and inspect the shape of the
# next-token probability distribution (batch, sequence position, vocabulary).
# The sentence is stored under `prompt` rather than `input` so the builtin
# input() is not shadowed for the rest of the kernel session.
prompt = "The cat sat on the mat"
input_ids = tokenizer.encode(prompt, return_tensors='pt')
# Inference only: no autograd graph is needed.
with torch.no_grad():
    outputs = model(input_ids)
probs = torch.softmax(outputs.logits, dim=-1)
probs.shape

torch.Size([1, 6, 50257])

In [6]:
# Surprisal in bits for every vocabulary entry at every position. `probs` is a
# torch tensor, so use the torch op directly: applying the NumPy ufunc to a
# tensor emits a warning (visible in this cell's recorded output).
-torch.log2(probs)

  -1 * np.log2(probs)


tensor([[[12.5820, 10.7411, 15.1673,  ..., 18.6831, 19.9233, 10.6083],
         [10.1866, 10.9002, 14.2638,  ..., 22.6555, 14.3161, 13.6603],
         [14.6027, 16.1091, 26.5289,  ..., 30.9486, 18.7936, 20.2484],
         [20.5522, 18.6787, 27.6044,  ..., 29.2884, 26.3446, 21.9959],
         [23.1821, 19.7404, 25.8828,  ..., 25.2227, 25.4650, 22.8360],
         [11.7230, 13.1833, 21.3001,  ..., 30.8406, 20.2315, 15.8648]]])

In [7]:
# Reload the checkpoint with a padding token configured so that sentences of
# different lengths can be batched, then score two example sentences.
# The checkpoint id lives in `model_name`; `model` is only ever the model object
# (previously `model` was first a string and then rebound to the model).
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
# Padding needs a pad token; reuse the end-of-sequence token for it.
tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained(model_name)
batch = to_tokens_and_logprobs(model, tokenizer, ["The cat sat on the mat", "The cat sat on the"])
batch[0]

Unnamed: 0,token,surprisal
0,cat,13.266438
1,sat,10.47286
2,on,2.158653
3,the,1.140176
4,mat,7.949978


In [17]:
# Use the GPU when PyTorch can see one, otherwise stay on the CPU, and move the
# model's weights there. The moved model is the cell's displayed value.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [2]:
def to_tokens_and_logprobs(model, tokenizer, input_texts):
    """Compute the per-token surprisal (in bits) of each text under a causal LM.

    The surprisal of the token at position ``i`` is ``-log2 p(token_i | tokens_<i)``,
    read from the model's prediction at position ``i - 1``. The first token of
    each text therefore has no surprisal and does not appear in the result.

    Parameters
    ----------
    model
        Causal language model. It is called as
        ``model(input_ids, attention_mask=...)`` and its result must expose
        ``.logits`` with shape (batch, sequence, vocabulary). Inputs are moved
        to whatever device the model's parameters live on.
    tokenizer
        Tokenizer matching ``model``. It must have a padding token configured
        because the texts are padded to a common length.
    input_texts
        The texts to score (a list of strings).

    Returns
    -------
    list of pandas.DataFrame
        One frame per input text with columns ``"token"`` (decoded token string)
        and ``"surprisal"`` (bits). Tokens whose id is one of
        ``tokenizer.all_special_ids`` (which includes the padding) are omitted.
    """
    if len(input_texts) == 0:
        return []

    # Pad only to the longest text in the batch rather than to the model maximum;
    # the padded positions are dropped from the result either way.
    # truncation=True cuts over-long texts at the tokenizer's maximum length.
    encoded = tokenizer(input_texts, padding=True, truncation=True, return_tensors="pt")

    # Follow the model's device so the call works whether or not the caller has
    # moved the model to the GPU.
    model_device = next(model.parameters()).device
    input_ids = encoded.input_ids.to(model_device)
    attention_mask = encoded.attention_mask.to(model_device)

    # Inference only: skip building the autograd graph for the full logits tensor.
    with torch.no_grad():
        logits = model(input_ids, attention_mask=attention_mask).logits

    # The prediction at index i is for the token at index i + 1, so drop the last
    # prediction and the first token to align them.
    # log_softmax avoids the underflow of softmax-then-log (probability 0 -> inf).
    log_probs = torch.log_softmax(logits[:, :-1, :].float(), dim=-1)
    targets = input_ids[:, 1:]
    target_log_probs = torch.gather(log_probs, 2, targets[:, :, None]).squeeze(-1)

    # Convert natural-log probabilities to surprisal in bits.
    gen_surprisals = (-target_log_probs / float(np.log(2.0))).cpu()
    targets = targets.cpu()

    # Look the special ids up once instead of once per token.
    special_ids = set(tokenizer.all_special_ids)

    # Gather the surprisals of each text into one table per text.
    batch = []
    for sentence_ids, sentence_surprisals in zip(targets.tolist(), gen_surprisals.tolist()):
        rows = [
            {"token": tokenizer.decode(token_id), "surprisal": surprisal}
            for token_id, surprisal in zip(sentence_ids, sentence_surprisals)
            if token_id not in special_ids
        ]
        # Fixed columns keep the frame usable even when every token was filtered out.
        batch.append(pd.DataFrame(rows, columns=["token", "surprisal"]))
    return batch

In [3]:
# Read the first ten generated texts from disk and score them with GPT-2.
GENERATIONS_DIR = "../Generations"
# os.listdir returns entries in arbitrary order; sort them so that "the first
# ten files" means the same files on every run and every machine.
files = sorted(os.listdir(GENERATIONS_DIR))
texts=[]
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name, padding=True)
# Padding needs a pad token; reuse the end-of-sequence token for it.
tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained(model_name)

for filepath in files[:10]:
    # NOTE(review): no explicit encoding is given, so the platform default is
    # used -- confirm how the generation files were written.
    with open(os.path.join(GENERATIONS_DIR, filepath), 'r') as file:
        text = file.read()
    texts.append(text)

batch = to_tokens_and_logprobs(model, tokenizer, texts)
batch[0]

  surprisals = -1 * np.log2(probs)


Unnamed: 0,token,surprisal
0,The,5.118950
1,teenage,14.721128
2,actor,9.354509
3,is,4.089015
4,set,6.280989
...,...,...
214,the,0.316361
215,next,0.164720
216,five,3.079309
217,years,0.046524


In [None]:
# Print the (rows, columns) shape of each per-text surprisal table in `batch`.
for text in batch:
    print(text.shape)

(219, 2)
(839, 2)
(1023, 2)
(486, 2)
(157, 2)
(496, 2)
(647, 2)
(561, 2)
(174, 2)
(222, 2)


In [8]:
def UID_variance(text):
    """Return the mean squared deviation of surprisal from its mean.

    `text` is a table with a 'surprisal' column. The sum of squared deviations
    is divided by the number of rows (not rows - 1).
    """
    values = text['surprisal']
    deviations = values - values.mean()
    n_rows = len(text)
    return (deviations ** 2).sum() / n_rows

def UID_pairwise(text):
    """Return the mean squared difference between consecutive surprisals.

    `text` is a table with a 'surprisal' column. The squared differences
    between each row and the one before it are summed and divided by
    (number of rows - 1).
    """
    steps = text['surprisal'].diff()
    n_pairs = len(text) - 1
    squared_steps = steps ** 2
    return squared_steps.sum() / n_pairs
# For each scored text, print UID_variance followed by UID_pairwise.
for text in batch:
    print(UID_variance(text), UID_pairwise(text))

14.642322754064937 34.29004496096813
7.86144779533585 15.725233124652565
9.443031910349875 19.42506240869066
11.484577992179283 23.46479639157494
12.4728244985312 23.10573174920876
12.487591018076383 22.9211969608712
10.594795901538907 21.91790469405356
8.55277134133881 17.235762894157862
16.409818469373388 35.68560934315286
13.219312761842966 25.676247881393657
