This notebook calculates the GPT-2 surprisal of each word in a stimulus under context windows of varying length.

# Setup

In [1]:
import torch
import os

# local
from utils import load_config
# Resolve the input (stimuli) and output directories from the project config.
config = load_config()
directories = config['directories']
stimuli_dir = os.path.join(directories['source_dir'], 'stimuli', 'text_with_wordlists')
output_dir = directories['stimuli_dir']
# Pick the best available accelerator instead of assuming Apple-silicon MPS, so
# the same cell runs on a Mac (MPS), on a CUDA machine, or on a CPU-only host.
# The getattr guard keeps this working on torch builds without the MPS backend.
_mps_backend = getattr(torch.backends, "mps", None)
if _mps_backend is not None and _mps_backend.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# # colab
# stimuli_dir = '.'
# output_dir = '.'
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(f"Using device: {device}")

Using device: mps


In [5]:
# Load stimuli
import os
import pandas as pd

story_name = "cable_spool_fort"

# The story is stored as consecutively numbered segment files
# (<story_name>_produced_0.txt, <story_name>_produced_1.txt, ...). Read them in
# order until the first missing index, then join the segments with one space.
segments = []
i = 0
path = os.path.join(stimuli_dir, f"{story_name}_produced_{i}.txt")
while os.path.exists(path):
    with open(path, 'r') as f:
        segments.append(f.read())
    i += 1
    path = os.path.join(stimuli_dir, f"{story_name}_produced_{i}.txt")
text = ' '.join(segments)
text

"The Cable Spool Fort by Bill Glover  Hey , Roy ?  shojen  What ?   You suck .  Chad said . He wished Roy wouldn't fall for that gag every time ,  get me a big rock , Roy .  Roy stooped to pick up a big , white caliche rock that looked like a dirty lump of chalk and handed it to Chad . Chad took the rock with disgust as Roy returned to staring at his shoes sempling . Chad was six , and at eight Roy should have rebute been teasing or ordering Chad around or something . But since his fall the year before , Roy had been shuffling and doing what he was told . Chad strained and pounded the rock into the iron hole until he could smell the dust . Several hated must staring brother after his weak not after tried prying two hadnt expression other mad think dust . It smelled like first grade . He wished he were there now , even if he did have to sit next to Roy . The rock was too stull big to fit into the iron hub of the big cable spool , so Chad leaned it against the tumbleweed and slid down of

In [3]:
# Optional one-time setup, left commented out: installs spaCy and downloads the
# `en_core_web_sm` English pipeline. Uncomment if either is missing.
# !pip install spacy -q
# !python -m spacy download en_core_web_sm

In [6]:
import spacy

# Run the full story through spaCy's small English pipeline; the resulting
# `doc` is what later cells iterate over sentence by sentence (`doc.sents`).
spacy_model_name = "en_core_web_sm"
nlp = spacy.load(spacy_model_name)
doc = nlp(text)

In [7]:
# Load model
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer
)
from tokenizers.processors import TemplateProcessing

# GPT-2 tokenizer and causal LM, with the model moved to the selected device.
model_name = 'gpt2'
tokenizer = AutoTokenizer.from_pretrained(model_name, return_tensors="pt", use_fast=True)
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

# Tokenize stimuli: one flat list of token ids in which every spaCy sentence is
# preceded by the BOS id and encoded on its own, without extra special tokens.
tokens = []
for sent in doc.sents:
  tokens += [tokenizer.bos_token_id] + tokenizer.encode(sent.text, add_special_tokens=False)

In [9]:
from tqdm.notebook import tqdm

In [10]:
# Word2token mapping
# Decode every token id individually, giving one string per token.
tokens_char = [tokenizer.decode(token_id) for token_id in tqdm(tokens)]

def clean_text(s):
  """Return `s` with all whitespace removed and the remainder lower-cased.

  Used to normalise both token strings and words so they can be compared by
  plain string equality.
  """
  pieces = s.split()  # splits on any whitespace run and drops empty pieces
  return ''.join(pieces).lower()

import warnings

# Normalised (whitespace-free, lower-cased) views of the token strings and of the
# whitespace-delimited words, so concatenated tokens can be compared to words.
tokens_char_clean = [clean_text(s) for s in tokens_char]
text_clean = [clean_text(s) for s in text.split()]

# Greedy alignment: walk the tokens, concatenating their cleaned strings until
# they spell the next word, and record that word's half-open span [start, end).
special_ids = {tokenizer.bos_token_id, tokenizer.eos_token_id}
word_bound = []
matched_words = []
working_word = ''
j = 0      # index of the next word in text_clean still to be matched
start = 0  # token index at which the word being assembled begins
for i in tqdm(range(0, len(tokens))):
  if tokens[i] in special_ids:
    # Sentence markers are never part of a word. Re-anchor on the token index
    # itself rather than `start + 1`: if a token that cleans to '' (pure
    # whitespace) precedes the marker, `start + 1` would land on the marker.
    # If a word is half-assembled, keep its start so no token is dropped.
    if not working_word:
      start = i + 1
    continue
  if j >= len(text_clean):
    # Every word is already matched; leftover tokens have no word to join.
    # Stop here instead of indexing past the end of text_clean.
    break
  working_word += tokens_char_clean[i]
  if text_clean[j] == working_word:
    matched_words.append(working_word)
    end = i+1
    word_bound.append([start, end])
    start = end
    working_word = ''
    j += 1

assert len(matched_words) == len(word_bound)
# The assertion above cannot fail on its own (both lists grow together), so
# report explicitly when the alignment stalled and later words were never matched.
if j < len(text_clean):
  warnings.warn(
    f"Word/token alignment stopped early: matched {j} of {len(text_clean)} words; "
    f"first unmatched word: {text_clean[j]!r}, accumulated token string: {working_word!r}"
  )

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/2488 [00:00<?, ?it/s]

  0%|          | 0/2488 [00:00<?, ?it/s]

# Inference

In [11]:
# surprisals[p] is the list of surprisals (negative natural-log probabilities)
# of the tokens following position p, scored with a context window that starts
# at token p and spans at most 1024 tokens.
surprisals = []
batch_size = 16
# Reuse EOS as the pad token so the padding='max_length' request below works.
tokenizer.pad_token = tokenizer.eos_token
token_ids = tokens

for i in tqdm(range(len(surprisals), len(token_ids), batch_size)):
    # Number of window start positions in this batch (the last batch may be short).
    n_windows = min(batch_size, len(token_ids) - i)
    # Window j feeds tokens [i+j, i+j+1024) and is scored against the same
    # window shifted by one token (the next-token targets).
    batch_tokens = [token_ids[i+j:i+j+1024] for j in range(n_windows)]
    ids = [token_ids[i+j+1:i+j+1025] for j in range(n_windows)]
    # NOTE(review): prepare_for_model is handed a whole batch (list of lists)
    # here; this appears to rely on its padding step handling batches — confirm.
    encoded = tokenizer.prepare_for_model(
        batch_tokens,
        add_special_tokens=False,
        return_attention_mask=False,
        return_tensors='pt',
        padding='max_length',
        max_length=1024,
    )

    with torch.no_grad():
        logits = model(**encoded.to(model.device)).logits.detach()

    # Log-softmax over the vocabulary axis.
    logprobs = logits - logits.logsumexp(-1, keepdim=True)

    for j, logprob in enumerate(logprobs):
        # Only positions that have a real next token are read, so the padded
        # tail of a short window is never scored.
        n_targets = min(1024, len(ids[j]))
        surprisal = (-logprob[range(n_targets), ids[j]]).tolist()
        surprisals.append(surprisal)

  0%|          | 0/156 [00:00<?, ?it/s]

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [149]:
# One line per context-window start position: comma-separated surprisal values.
surprisals_path = os.path.join(output_dir, f'surprisals_{story_name}.txt')
with open(surprisals_path, 'w') as f:
    for row in surprisals:
        f.write(','.join(map(str, row)) + '\n')

In [150]:
# colab: download
# google.colab only exists inside a Colab runtime. When running locally the file
# has already been written to output_dir, so skip the browser download instead
# of failing with ImportError.
try:
    from google.colab import files
except ImportError:
    files = None

if files is not None:
    files.download(os.path.join(output_dir, f'surprisals_{story_name}.txt'))

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [151]:
# One row per matched word: its token span (start, end) and the cleaned word.
word2tokens = (
    pd.DataFrame(word_bound, columns=['start', 'end'])
    .assign(word=matched_words)
)
word2tokens_path = os.path.join(output_dir, f'word2tokens_{story_name}.csv')
word2tokens.to_csv(word2tokens_path, index=False)
word2tokens

Unnamed: 0,start,end,word
0,1,2,the
1,2,3,cable
2,3,5,spool
3,5,6,fort
4,6,7,by
...,...,...,...
2023,2483,2484,it
2024,2484,2485,was
2025,2485,2486,all
2026,2486,2487,right


In [152]:
# colab: download
# google.colab only exists inside a Colab runtime. When running locally the CSV
# has already been written to output_dir, so skip the browser download instead
# of failing with ImportError.
try:
    from google.colab import files
except ImportError:
    files = None

if files is not None:
    files.download(os.path.join(output_dir, f'word2tokens_{story_name}.csv'))

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>