# GUM UID Exploration (Minimal)

In [64]:
from pathlib import Path
from conllu import parse_incr

from src.uid import load_lm, compute_surprisal, uid_metrics, build_context
from src.units.sentence import PassiveSentence
from src.utils import render_tree

In [65]:
# Language model used for surprisal; swap in "distilgpt2" for faster runs.
model_name = "gpt2"

# GUM training treebank (CoNLL-U), given relative to the working directory.
gum_path = Path("data") / "en_gum-ud-train.conllu"

# load_lm hands back the tokenizer, the model and the device, in that order.
tokenizer, model, device = load_lm(model_name=model_name)

Loading weights: 100%|██████████| 148/148 [00:00<00:00, 2469.96it/s, Materializing param=transformer.wte.weight]             
GPT2LMHeadModel LOAD REPORT from: gpt2
Key                  | Status     |  | 
---------------------+------------+--+-
h.{0...11}.attn.bias | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


In [66]:
# _docs: document id -> sentence texts in file order.
# _idx:  sentence id -> (document id, position inside _docs[document id], parsed sentence).
_docs = {}
_idx = {}
with gum_path.open("r", encoding="utf-8") as handle:
    doc_id = None
    for sentence in parse_incr(handle):
        meta = sentence.metadata or {}

        # A "newdoc id" comment switches the current document.
        if "newdoc id" in meta:
            doc_id = meta["newdoc id"]
            _docs.setdefault(doc_id, [])

        # Sentences seen before any document marker are skipped.
        if doc_id is None:
            continue

        text = meta.get("text")
        if not text:
            # No usable "text" metadata: join the forms of integer-id tokens.
            text = " ".join(tok["form"] for tok in sentence if isinstance(tok.get("id"), int))

        doc_sentences = _docs[doc_id]
        doc_sentences.append(text)

        if "sent_id" in meta:
            _idx[meta["sent_id"]] = (doc_id, len(doc_sentences) - 1, sentence)

In [67]:
# Build one row per sentence that PassiveSentence accepts, pairing the
# original (passive) text with its depassivized (active) rewrite.
import pandas as pd

passive_rows = []
for sid, entry in _idx.items():
    doc_id, i, sent = entry
    try:
        ps = PassiveSentence(sent)
    except ValueError:
        # PassiveSentence rejected this sentence (presumably not a usable
        # passive -- confirm in src.units.sentence); leave it out.
        continue

    passive_text = sent.metadata.get("text")
    if not passive_text:
        # Same fallback as the indexing cell: join integer-id token forms.
        passive_text = " ".join(t["form"] for t in sent if isinstance(t.get("id"), int))
    active_text = ps.depassivize().text

    passive_rows.append(
        dict(
            sid=sid,
            doc_id=doc_id,
            sent_idx=i,
            passive=passive_text,
            active=active_text,
        )
    )

passives_df = pd.DataFrame(passive_rows)
len(passives_df)

139

In [68]:
# Persist the passive/active pairs next to the notebook. With pandas'
# defaults the row index is written as the first (unnamed) column.
passives_df.to_csv(path_or_buf='passives.csv')

In [5]:
def uid_mean(text, context):
    """Return the "uid_mean" metric for `text` scored after `context`.

    Calls `compute_surprisal` with the notebook-level `tokenizer`, `model`
    and `device`, feeds the second element of its result to `uid_metrics`,
    and returns that result's "uid_mean" entry.
    """
    _unused, surprisals = compute_surprisal(text, context, tokenizer, model, device=device)
    metrics = uid_metrics(surprisals)
    return metrics["uid_mean"]

In [6]:
# Peek at the first 50 sentence ids. The ids are sorted as plain strings,
# so "...-10" is listed before "...-2".
first_sentence_ids = sorted(_idx.keys())
first_sentence_ids[:50]

['GUM_academic_art-1',
 'GUM_academic_art-10',
 'GUM_academic_art-11',
 'GUM_academic_art-12',
 'GUM_academic_art-13',
 'GUM_academic_art-14',
 'GUM_academic_art-15',
 'GUM_academic_art-16',
 'GUM_academic_art-17',
 'GUM_academic_art-18',
 'GUM_academic_art-19',
 'GUM_academic_art-2',
 'GUM_academic_art-20',
 'GUM_academic_art-21',
 'GUM_academic_art-22',
 'GUM_academic_art-23',
 'GUM_academic_art-24',
 'GUM_academic_art-25',
 'GUM_academic_art-26',
 'GUM_academic_art-27',
 'GUM_academic_art-28',
 'GUM_academic_art-3',
 'GUM_academic_art-4',
 'GUM_academic_art-5',
 'GUM_academic_art-6',
 'GUM_academic_art-7',
 'GUM_academic_art-8',
 'GUM_academic_art-9',
 'GUM_academic_census-1',
 'GUM_academic_census-10',
 'GUM_academic_census-11',
 'GUM_academic_census-12',
 'GUM_academic_census-13',
 'GUM_academic_census-14',
 'GUM_academic_census-15',
 'GUM_academic_census-16',
 'GUM_academic_census-17',
 'GUM_academic_census-18',
 'GUM_academic_census-19',
 'GUM_academic_census-2',
 'GUM_academic_

In [7]:
# Dependency tree of one indexed sentence, looked up by its sent_id
sid = "GUM_academic_economics-10"
_tree_entry = _idx[sid]
doc_id, i, sent = _tree_entry
render_tree(sent)

In       ADP   <══╗                                     case
other    ADJ   <╗ ║                                     amod
words    NOUN  ═╝═╝═╗<════════════════════════════╗     obl
,        PUNCT <════╝                             ║     punct
it       PRON  <════════════════════════════════╗ ║     expl
is       AUX   <══════════════════════════════╗ ║ ║     cop
critical ADJ   ═════════════════════════════╗═╝═╝═╝═╗═╗ root
that     SCONJ <══════════════════════════╗ ║       ║ ║ mark
economic ADJ   <╗                         ║ ║       ║ ║ amod
agents   NOUN  ═╝<══════════════════════╗ ║ ║       ║ ║ nsubj
believe  VERB  ═══════════════════════╗═╝═╝<╝       ║ ║ csubj
that     SCONJ <════════════════════╗ ║             ║ ║ mark
their    PRON  <══╗                 ║ ║             ║ ║ nmod:poss
property NOUN  <╗ ║                 ║ ║             ║ ║ compound
rights   NOUN  ═╝═╝<══════════════╗ ║ ║             ║ ║ nsubj:pass
will     AUX   <════════════════╗ ║ ║ ║             ║ ║ aux
not      

In [8]:
# passive UID (sentence only, empty context)
sid1 = "GUM_academic_economics-10"
doc1, i1, sent1 = _idx[sid1]
# Read the text from the per-document list instead of sent1.metadata["text"]:
# the indexing cell stored it there with the token-join fallback already
# applied, so it is never None, and it is the same string that sits at
# position i1 of the list later handed to build_context.
passive1 = _docs[doc1][i1]
uid1_p = uid_mean(passive1, "")
print(passive1, uid1_p, sep="\n")

In other words, it is critical that economic agents believe that their property rights will not be taken away by other public or private actors [15].
5.002239635278439


In [9]:
# active rewrite of example 1, scored with an empty context
_depassivized1 = PassiveSentence(sent1).depassivize()
active1 = _depassivized1.text
uid1_a = uid_mean(active1, "")
print(active1, uid1_a, sep="\n")

In other words, it is critical that economic agents believe that other public or private actors will not take their property rights away [15].
5.6124406534212605


In [10]:
# example 1, context built with mode="prev", k=1
ctx1_prev1 = build_context(_docs[doc1], i1, mode="prev", k=1, tokenizer=tokenizer)
# Passive is scored first, then active, both against the same context.
uid1_p_prev1, uid1_a_prev1 = (uid_mean(variant, ctx1_prev1) for variant in (passive1, active1))
print(uid1_p_prev1, uid1_a_prev1, sep="\n")

4.645602260106083
5.140297444261335


In [11]:
# example 1, context built with mode="doc"
ctx1_doc = build_context(_docs[doc1], i1, mode="doc", tokenizer=tokenizer)
# Passive is scored first, then active, both against the same context.
uid1_p_doc, uid1_a_doc = (uid_mean(variant, ctx1_doc) for variant in (passive1, active1))
print(uid1_p_doc, uid1_a_doc, sep="\n")

4.308565437793732
4.715669128078002


In [12]:
# example 2 (sentence-only): passive UID
sid2 = "GUM_academic_mutation-11"
doc2, i2, sent2 = _idx[sid2]
# Read the text from the per-document list instead of sent2.metadata["text"]:
# the indexing cell stored it there with the token-join fallback already
# applied, so it is never None, and it is the same string that sits at
# position i2 of the list later handed to build_context.
passive2 = _docs[doc2][i2]
uid2_p = uid_mean(passive2, "")
print(passive2, uid2_p, sep="\n")

The different mutants are generated automatically by the application of mutation operators.
7.48641195664039


In [13]:
# example 2 (sentence-only): UID of the depassivized (active) rewrite
_depassivized2 = PassiveSentence(sent2).depassivize()
active2 = _depassivized2.text
uid2_a = uid_mean(active2, "")
print(active2, uid2_a, sep="\n")

The application of mutation operators generates the different mutants automatically.
9.32248178395358


In [14]:
# example 2, context built with mode="prev", k=3
ctx2_prev3 = build_context(_docs[doc2], i2, mode="prev", k=3, tokenizer=tokenizer)
# Passive is scored first, then active, both against the same context.
uid2_p_prev3, uid2_a_prev3 = (uid_mean(variant, ctx2_prev3) for variant in (passive2, active2))
print(uid2_p_prev3, uid2_a_prev3, sep="\n")

6.0848787530110435
8.085243444551121


In [15]:
# example 2, context built with mode="doc"
ctx2_doc = build_context(_docs[doc2], i2, mode="doc", tokenizer=tokenizer)
# Passive is scored first, then active, both against the same context.
uid2_p_doc, uid2_a_doc = (uid_mean(variant, ctx2_doc) for variant in (passive2, active2))
print(uid2_p_doc, uid2_a_doc, sep="\n")

5.007203772090948
6.569517829878763


In [16]:
# example 3 (sentence-only): passive UID
sid3 = "GUM_academic_epistemic-7"
doc3, i3, sent3 = _idx[sid3]
# Read the text from the per-document list instead of sent3.metadata["text"]:
# the indexing cell stored it there with the token-join fallback already
# applied, so it is never None, and it is the same string that sits at
# position i3 of the list later handed to build_context.
passive3 = _docs[doc3][i3]
uid3_p = uid_mean(passive3, "")
print(passive3, uid3_p, sep="\n")

We live in an epistemic environment that is heavily and deliberately polluted by agents who use mimicry and other methods as a means of inflating their pretense to expertise.
5.576692832168192


In [17]:
# example 3 (sentence-only): UID of the depassivized (active) rewrite
_depassivized3 = PassiveSentence(sent3).depassivize()
active3 = _depassivized3.text
uid3_a = uid_mean(active3, "")
print(active3, uid3_a, sep="\n")

We live in an epistemic environment that agents who use mimicry and other methods as a means of inflating their pretense to expertise heavily and deliberately pollute.
6.4530542428943924


In [18]:
# example 3, context built with mode="prev", k=2
ctx3_prev2 = build_context(_docs[doc3], i3, mode="prev", k=2, tokenizer=tokenizer)
# Passive is scored first, then active, both against the same context.
uid3_p_prev2, uid3_a_prev2 = (uid_mean(variant, ctx3_prev2) for variant in (passive3, active3))
print(uid3_p_prev2, uid3_a_prev2, sep="\n")

5.4846081564969875
6.319475277101226


In [19]:
# example 3, context built with mode="doc"
ctx3_doc = build_context(_docs[doc3], i3, mode="doc", tokenizer=tokenizer)
# Passive is scored first, then active, both against the same context.
uid3_p_doc, uid3_a_doc = (uid_mean(variant, ctx3_doc) for variant in (passive3, active3))
print(uid3_p_doc, uid3_a_doc, sep="\n")

5.024288416730569
6.034897722952971


In [20]:
# Draw a seeded random sample of (up to) 20 passive sentences
import random
import pandas as pd

random.seed(7)

# Collect (sid, doc_id, sentence index, passive text, active text) for every
# indexed sentence that PassiveSentence accepts.
passive_sents = []
for sid, entry in _idx.items():
    doc_id, i, sent = entry
    try:
        ps = PassiveSentence(sent)
    except ValueError:
        continue
    passive_text = sent.metadata.get("text")
    if not passive_text:
        # No usable "text" metadata: join the forms of integer-id tokens.
        passive_text = " ".join(t["form"] for t in sent if isinstance(t.get("id"), int))
    active_text = ps.depassivize().text
    passive_sents.append((sid, doc_id, i, passive_text, active_text))

# Cap the sample size at the number of passives actually found.
_sample_size = min(20, len(passive_sents))
sample = random.sample(passive_sents, k=_sample_size)
len(sample)

20

In [21]:
def _context_for(doc_id, i, **mode_kwargs):
    """Call build_context on `_docs[doc_id]` at index `i` with the notebook tokenizer."""
    return build_context(_docs[doc_id], i, tokenizer=tokenizer, **mode_kwargs)


def ctx_none(doc_id, i):
    """Empty context: the arguments are ignored and "" is returned."""
    return ""


def ctx_prev1(doc_id, i):
    """Context from build_context with mode="prev", k=1."""
    return _context_for(doc_id, i, mode="prev", k=1)


def ctx_prev3(doc_id, i):
    """Context from build_context with mode="prev", k=3."""
    return _context_for(doc_id, i, mode="prev", k=3)


def ctx_doc(doc_id, i):
    """Context from build_context with mode="doc"."""
    return _context_for(doc_id, i, mode="doc")


# Context name -> builder taking (doc_id, sentence index); insertion order is
# the order in which the conditions are evaluated downstream.
contexts = dict(
    none=ctx_none,
    prev1=ctx_prev1,
    prev3=ctx_prev3,
    doc=ctx_doc,
)

list(contexts)

['none', 'prev1', 'prev3', 'doc']

In [22]:
# Score every sampled sentence (passive and active form) under each context
# condition and record the active-minus-passive difference.
# NOTE(review): a recorded run of this cell emitted a tokenizer warning about a
# 1150-token sequence exceeding the model's 1024-token maximum -- confirm that
# build_context / compute_surprisal truncate long contexts.
rows = []
for sid, doc_id, i, passive_text, active_text in sample:
    for ctx_name in contexts:
        ctx = contexts[ctx_name](doc_id, i)
        # Passive is scored before active, both against the same context.
        uid_p, uid_a = uid_mean(passive_text, ctx), uid_mean(active_text, ctx)
        rows.append(
            dict(
                sid=sid,
                context=ctx_name,
                uid_passive=uid_p,
                uid_active=uid_a,
                delta_active_minus_passive=uid_a - uid_p,
            )
        )

len(rows)

Token indices sequence length is longer than the specified maximum sequence length for this model (1150 > 1024). Running this sequence through the model will result in indexing errors


80

In [23]:
# Per-context summary of the n=20 sample: row count plus the mean passive
# UID, mean active UID and mean (active - passive) delta.
_summary_spec = {
    "n": pd.NamedAgg(column="sid", aggfunc="count"),
    "mean_uid_passive": pd.NamedAgg(column="uid_passive", aggfunc="mean"),
    "mean_uid_active": pd.NamedAgg(column="uid_active", aggfunc="mean"),
    "mean_delta": pd.NamedAgg(column="delta_active_minus_passive", aggfunc="mean"),
}
_per_sentence = pd.DataFrame(rows)
summary = _per_sentence.groupby("context", as_index=False).agg(**_summary_spec)

summary.sort_values("context").reset_index(drop=True)

Unnamed: 0,context,n,mean_uid_passive,mean_uid_active,mean_delta
0,doc,20,4.672989,5.372869,0.69988
1,none,20,5.651854,6.337863,0.686009
2,prev1,20,5.16452,5.879958,0.715439
3,prev3,20,4.938602,5.636504,0.697902


In [24]:
# Random sample of 50 passives
import random

random.seed(7)

# Reuse the (sid, doc_id, sentence index, passive text, active text) tuples
# already collected in `passive_sents` by the n=20 sampling cell. The previous
# version of this cell rebuilt exactly the same list by running
# PassiveSentence/depassivize over every indexed sentence a second time.
# A copy is taken so the two lists stay independent objects.
passive_sents_50 = list(passive_sents)

# Same seed and same population order as before, so the draw is unchanged;
# the sample size is capped at the number of passives available.
sample_50 = random.sample(passive_sents_50, k=min(50, len(passive_sents_50)))
len(sample_50)

50

In [25]:
# Context conditions for the n=50 run; these are the builders defined earlier
# (ctx_none, ctx_prev1, ctx_prev3, ctx_doc), in the same order.
contexts_50 = dict(
    none=ctx_none,
    prev1=ctx_prev1,
    prev3=ctx_prev3,
    doc=ctx_doc,
)

list(contexts_50)

['none', 'prev1', 'prev3', 'doc']

In [26]:
# Score every sentence of the n=50 sample (passive and active form) under
# each context condition and record the active-minus-passive difference.
rows_50 = []
for sid, doc_id, i, passive_text, active_text in sample_50:
    for ctx_name in contexts_50:
        ctx = contexts_50[ctx_name](doc_id, i)
        # Passive is scored before active, both against the same context.
        uid_p, uid_a = uid_mean(passive_text, ctx), uid_mean(active_text, ctx)
        rows_50.append(
            dict(
                sid=sid,
                context=ctx_name,
                uid_passive=uid_p,
                uid_active=uid_a,
                delta_active_minus_passive=uid_a - uid_p,
            )
        )

len(rows_50)

200

In [27]:
# Per-context summary of the n=50 sample: row count plus the mean passive
# UID, mean active UID and mean (active - passive) delta.
_summary_spec_50 = {
    "n": pd.NamedAgg(column="sid", aggfunc="count"),
    "mean_uid_passive": pd.NamedAgg(column="uid_passive", aggfunc="mean"),
    "mean_uid_active": pd.NamedAgg(column="uid_active", aggfunc="mean"),
    "mean_delta": pd.NamedAgg(column="delta_active_minus_passive", aggfunc="mean"),
}
_per_sentence_50 = pd.DataFrame(rows_50)
summary_50 = _per_sentence_50.groupby("context", as_index=False).agg(**_summary_spec_50)

summary_50.sort_values("context").reset_index(drop=True)

Unnamed: 0,context,n,mean_uid_passive,mean_uid_active,mean_delta
0,doc,50,4.724581,5.508554,0.783973
1,none,50,5.61497,6.434327,0.819357
2,prev1,50,5.247683,6.05416,0.806478
3,prev3,50,4.993903,5.760218,0.766315
