In [1]:
import json
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from tqdm import tqdm
import re
import pandas as pd


  from .autonotebook import tqdm as notebook_tqdm


Load model:

In [2]:
# Hugging Face checkpoint name, defined once so that the tokenizer and the
# model weights are always loaded from the same checkpoint.
MODEL_NAME = "gpt2-large"

tokenizer = GPT2Tokenizer.from_pretrained(MODEL_NAME)
model = GPT2LMHeadModel.from_pretrained(MODEL_NAME)
# Put the module in evaluation mode for scoring. This stays the last
# expression of the cell so the notebook still displays the model summary.
model.eval()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 1280)
    (wpe): Embedding(1024, 1280)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-35): 36 x GPT2Block(
        (ln_1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=3840, nx=1280)
          (c_proj): Conv1D(nf=1280, nx=1280)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=5120, nx=1280)
          (c_proj): Conv1D(nf=1280, nx=5120)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1280, out_features=50257, bias=False)
)

In [4]:
def clean_meaning(meaning):
    """Return the leading fragment of a meaning string.

    The text is cut at the first colon, comma or period, and surrounding
    whitespace is removed from what remains. A string without any of these
    delimiters is returned whole (stripped).

    NOTE(review): the previous comment mentioned a semicolon, but the pattern
    splits on ':' and not ';' - confirm which delimiter is intended.
    """
    first_piece = re.split(r'[:,.]', meaning)[0]
    return first_piece.strip()


Insert meaning inline:


In [5]:
def insert_meaning(sentence, homonym, meaning):
    """Annotate the first whole-word occurrence of ``homonym`` with its meaning.

    The meaning is shortened with ``clean_meaning`` and inserted in square
    brackets directly before the homonym, e.g. ``"the bank"`` becomes
    ``"the [financial institution] bank"``.

    Args:
        sentence: Text to search.
        homonym: Word to look for. It is matched literally (regex-escaped),
            case-sensitively, and only between word boundaries.
        meaning: Gloss to insert; passed through ``clean_meaning`` first.

    Returns:
        The sentence with the bracketed gloss inserted before the first
        match, or the sentence unchanged when the homonym does not occur.
    """
    # replace only first occurrence, with word boundary
    pattern = r'\b' + re.escape(homonym) + r'\b'
    gloss = clean_meaning(meaning)

    # A callable replacement is used on purpose: re.sub treats a replacement
    # *string* as a template, so a backslash in the gloss or homonym would
    # raise re.error ("bad escape") or be expanded as a group reference.
    def _annotate(match):
        return f"[{gloss}] {match.group(0)}"

    return re.sub(pattern, _annotate, sentence, count=1)

Compute surprisal from GPT-2:

In [6]:
def compute_suprisal(text):
    """Score ``text`` with the loaded language model and return its loss.

    The text is tokenized into PyTorch tensors (truncated to at most 512
    tokens) and fed to the module-level ``model`` with the input ids reused
    as labels. Gradients are disabled for the forward pass.

    Returns:
        The model's ``loss`` as a Python float. Presumably this is the mean
        per-token cross-entropy of the causal LM - confirm against the
        transformers documentation.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        lm_loss = model(**encoded, labels=encoded["input_ids"]).loss
    return lm_loss.item()

Load dataset:

In [None]:
# Read the dataset into `data`.
# The encoding is given explicitly: without it open() falls back to the
# platform's locale encoding, which can mis-decode a UTF-8 JSON file.
with open("data/train.json", 'r', encoding="utf-8") as f:
    data = json.load(f)

#print(len(data))

In [None]:
# Score every dataset entry and collect one record per entry.
results = []

# loop over all entries
for key, entry in tqdm(data.items()):
    homonym = entry['homonym']
    meaning = entry['judged_meaning']
    sentence = entry['sentence']
    precontext = entry.get("precontext", "")
    ending = entry.get("ending", "")

    # Insert the bracketed meaning in front of the homonym.
    modified_sentence = insert_meaning(sentence, homonym, meaning)

    # Combine context. Empty or missing (None) parts are skipped so that a
    # null "precontext"/"ending" does not raise TypeError in str.join and no
    # doubled spaces are produced.
    full_text = " ".join(
        part for part in (precontext, modified_sentence, ending) if part
    ).strip()

    # Compute suprisal. This is deliberately best-effort: a failing entry is
    # recorded with suprisal=None and reported, instead of aborting the run.
    try:
        suprisal = compute_suprisal(full_text)
    except Exception as e:
        suprisal = None
        # Must be an f-string, otherwise the literal "{key}:{e}" is printed
        # and the failing entry cannot be identified.
        print(f"Error for entry {key}:{e}")

    results.append({
        "id" : key,
        "homonym" : homonym,
        "judge_meaning" : meaning,
        "sentence" : sentence,
        "modified_sentence" : modified_sentence,
        "suprisal" : suprisal,
        "average" : entry.get("average")
    })

df = pd.DataFrame(results)
# NOTE(review): the file name says "dev" while the dataset cell reads
# data/train.json - confirm the intended split before reusing this file.
df.to_csv("/semeval2026/data/suprisal_results_GPT2_large_dev.csv", index=False) # !please change path accordingly
print("Done! Resuls saved!")

  0%|          | 0/588 [00:00<?, ?it/s]`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.
100%|██████████| 588/588 [02:27<00:00,  3.99it/s]

Done! Resuls saved!



