In [2]:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, GPT2Tokenizer
from fancy_einsum import einsum

#### GPT2

In [3]:
model_name = "gpt2-medium"
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [4]:
token_embeds = model.transformer.wte.weight
value_vectors = torch.cat(
    [
        model.transformer.h[layer_idx].mlp.c_proj.weight
        for layer_idx in range(model.config.num_hidden_layers)
    ],
    dim=0,
)
print(value_vectors.shape)

torch.Size([98304, 1024])


In [5]:

seed_token_pos = ["happy", "joy", " happy", " joy", " smile"]
seed_token_neg = [" sad", " angry", " disgust"]

pos_token_id = [tokenizer.encode(tok)[0] for tok in seed_token_pos]
neg_token_id = [tokenizer.encode(tok)[0] for tok in seed_token_neg]

print(pos_token_id)
print(neg_token_id)

pos_embed = token_embeds[pos_token_id].mean(dim=0)
neg_embed = token_embeds[neg_token_id].mean(dim=0)

[34191, 2633, 3772, 8716, 8212]
[6507, 7954, 16234]


In [6]:
def unembed_to_text(vector, model, tokenizer, k=10):
    norm = model.transformer.ln_f
    lm_head = model.lm_head.weight
    dots = einsum("vocab d_model, d_model -> vocab", lm_head, norm(vector))
    top_k = dots.topk(k).indices
    return tokenizer.batch_decode(top_k, skip_special_tokens=True)

In [7]:

k = 20
norm = model.transformer.ln_f

target_vec = pos_embed - neg_embed
dot_prods = einsum("value_vecs d_model, d_model -> value_vecs", norm(value_vectors), target_vec)
top_value_vecs = dot_prods.topk(k).indices
for vec_idx in top_value_vecs:
    print(f"Value vec: Layer {vec_idx // 1024}, index {vec_idx % 1024}")
    print(unembed_to_text(value_vectors[vec_idx], model, tokenizer))

Value vec: Layer 80, index 988
[' secure', ' successful', ' successfully', ' satisfactory', ' optimal', ' improved', ' optim', ' perfected', ' excellent', ' efficient']
Value vec: Layer 67, index 136
[' peaceful', ' stable', ' satisfactory', ' good', ' trustworthy', ' safe', ' reassured', ' credibility', 'Safe', ' impartial']
Value vec: Layer 54, index 386
[' positives', ' advant', ' blessed', ' mirac', ' upl', ' pristine', ' bright', ' smiles', ' buoy', ' boon']
Value vec: Layer 71, index 367
[' Congratulations', 'osponsors', 'Congratulations', ' Honor', 'aug', ' enriched', ' Excellence', ' rewarded', ' Blossom', ' Celebr']
Value vec: Layer 60, index 301
[' collaborations', ' achievements', ' excellence', ' breakthrough', ' Inspired', ' Excellence', ' inspired', ' Aub', ' amazing', ' Citation']
Value vec: Layer 93, index 414
['hari', ' externalToEVAOnly', 'perm', 'ifty', 'kin', ' Atomic', 'ourced', 'pac', '=-=-=-=-=-=-=-=-', 'allows']
Value vec: Layer 54, index 440
[' unaffected', ' a

In [8]:


target_vec = neg_embed - pos_embed

dot_prods = einsum("value_vecs d_model, d_model -> value_vecs", norm(value_vectors), target_vec)
top_value_vecs = dot_prods.topk(k).indices

for vec_idx in top_value_vecs:
    print(f"Value vec: Layer {vec_idx // 1024}, index {vec_idx % 1024}")
    print(unembed_to_text(value_vectors[vec_idx], model, tokenizer))

Value vec: Layer 70, index 905
[' hate', ' hated', ' negativity', ' bad', ' dreaded', ' harmful', ' adversaries', ' enemies', ' harsh', 'enemy']
Value vec: Layer 55, index 142
[' burdens', ' troubled', ' risks', ' misfortune', ' headache', ' trouble', ' risk', ' nightmare', ' adverse', ' toxicity']
Value vec: Layer 69, index 567
[' inability', ' failed', ' unable', ' inadequate', ' lack', ' failing', ' lacking', ' failure', ' insufficient', ' fail']
Value vec: Layer 68, index 443
[' burdens', ' worst', ' worse', ' toxic', ' humiliating', ' waste', ' nightmare', ' pests', ' wasting', ' protracted']
Value vec: Layer 64, index 974
[' inappropriately', ' prejud', ' unnecessarily', ' improperly', ' unchecked', ' incorrectly', ' inefficient', ' miscon', ' arrogance', ' excessively']
Value vec: Layer 81, index 762
[' problems', ' malfunction', ' failure', ' failures', ' damage', ' woes', ' dysfunction', ' trouble', ' injuries', ' damaged']
Value vec: Layer 65, index 619
[' inability', ' ineff

#### Llama3

In [9]:
model_name = "meta-llama/Llama-3.1-8B"
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [10]:
token_embeds = model.model.embed_tokens.weight
value_vectors = torch.cat(
    [
        model.model.layers[layer_idx].mlp.down_proj.weight.T
        for layer_idx in range(model.config.num_hidden_layers)
    ],
    dim=0,
)
print(value_vectors.shape)

torch.Size([458752, 4096])


In [14]:
seed_token_pos = ["happy", "joy", " happy", " joy", " smile"]
seed_token_neg = [" sad", " angry", " disgust"]

pos_token_id = [
    tokenizer(tok, add_special_tokens=False)["input_ids"][0]
    for tok in seed_token_pos
    if len(tokenizer(tok, add_special_tokens=False)["input_ids"]) == 1
]

neg_token_id = [
    tokenizer(tok, add_special_tokens=False)["input_ids"][0]
    for tok in seed_token_neg
    if len(tokenizer(tok, add_special_tokens=False)["input_ids"]) == 1
]

print("Positive token IDs:", pos_token_id)
print("Negative token IDs:", neg_token_id)

pos_embed = token_embeds[pos_token_id].mean(dim=0)
neg_embed = token_embeds[neg_token_id].mean(dim=0)


Positive token IDs: [57621, 4215, 6380, 16267, 15648]
Negative token IDs: [12703, 19021, 68162]


In [19]:
def unembed_to_text(vector, model, tokenizer, k=10):
    norm = model.model.norm  
    lm_head = model.lm_head.weight

    normed_vector = norm(vector.unsqueeze(0)).squeeze(0)  # shape: [d_model]

    dots = torch.einsum("vd,d->v", lm_head, normed_vector)

    top_k = dots.topk(k).indices
    return tokenizer.batch_decode(top_k, skip_special_tokens=True)


In [20]:
k = 20
norm = model.model.norm  

pos_vec = pos_embed - neg_embed
dot_prods = torch.einsum("nd,d->n", norm(value_vectors), pos_vec)
top_value_vecs = dot_prods.topk(k).indices

for vec_idx in top_value_vecs:
    print(f"Value vec: Layer {vec_idx // 4096}, index {vec_idx % 4096}")
    print(unembed_to_text(value_vectors[vec_idx], model, tokenizer))


Value vec: Layer 3, index 1403
['ryn', ' Zaman', 'tha', 'rlen', 'lem', ' Major', ' Kaf', 'inventory', 'indexed', ' Lem']
Value vec: Layer 1, index 2835
[' Karlov', ' Tay', 'osit', 'he', 've', 'οκ', 'avic', 'ashed', 'LOWER', ' intersections']
Value vec: Layer 3, index 2045
['achines', '.GetBytes', ' Nas', 'estone', 'mav', 'ses', '날', ' convention', 'emma', '-faced']
Value vec: Layer 24, index 29
['ummings', 'ovny', '伏', 'aeda', 'μον', 'ST', 'alus', 'บาย', ' surrogate', 'hire']
Value vec: Layer 1, index 1775
['orie', 'iens', 'ebo', 'oria', 'ORIA', 'ADO', ' yans', '646', 'agu', 'ado']
Value vec: Layer 5, index 2916
['ONY', 'ony', ' Carnegie', '徒', ' deb', 'лю', ' Duty', ' Toledo', '楽', 'ut']
Value vec: Layer 85, index 262
[' Cous', 'code', 'dam', '862', '775', 'aine', 'exact', ' dam', ' Biom', 'rene']
Value vec: Layer 56, index 2225
['igs', 'ably', 'apr', 'fully', '頼', ' Carol', 'ardi', 'asley', ' Leone', '160']
Value vec: Layer 67, index 1922
['fav', ' boyc', ' setCurrent', ' �', ' adopt

In [21]:
print(unembed_to_text(pos_vec, model, tokenizer))

['FullScreen', ' Erd', 'Parms', ' rich', ' Lester', 'iani', 'Uvs', 'aty', ' Richards', 'ongs']


In [22]:
k = 20
norm = model.model.norm  

neg_vec = neg_embed - pos_embed 
dot_prods = torch.einsum("nd,d->n", norm(value_vectors), neg_vec)
top_value_vecs = dot_prods.topk(k).indices

for vec_idx in top_value_vecs:
    print(f"Value vec: Layer {vec_idx // 4096}, index {vec_idx % 4096}")
    print(unembed_to_text(value_vectors[vec_idx], model, tokenizer))


Value vec: Layer 62, index 1457
['dou', '_behavior', 'pine', ' Race', 'peq', 'بان', 'StartPosition', 'arna', 'odata', 'hti']
Value vec: Layer 0, index 3581
['endale', 'LocalizedString', '�', 'imli', 'abella', 'abric', 'еди', 'endon', 'ogue', 'gre']
Value vec: Layer 26, index 1533
['REFER', '.addObject', ' pokoj', 'viders', 'quee', 'otte', 'prs', 'UNET', ' Dalton', 'kehr']
Value vec: Layer 7, index 1705
['�', 'tat', 'iginal', 'earn', 'erif', 'phin', ' Conway', 'ropol', 'MMdd', 'aña']
Value vec: Layer 66, index 3597
['クセ', 'weeney', 'antu', ' eğ', '?><?', 'ometr', 'uhan', 'ンバ', 'アー', 'chl']
Value vec: Layer 28, index 564
['筋', ' haus', ' Haw', 'ello', 'atu', '233', 'ore', ' priv', 'πο', ' presidency']
Value vec: Layer 33, index 2083
['pom', ' discret', '선', 'aram', 'arme', 'ltk', 'WF', 'ilha', 'dej', 'αρ']
Value vec: Layer 105, index 2332
['ột', 'apat', 'ős', ' bumped', ' Parcel', 'chas', 'appe', ' mpg', ' Cal', ' gep']
Value vec: Layer 8, index 2709
['riott', 'ンティ', '[section', 'avel', 

In [23]:
print(unembed_to_text(neg_vec, model, tokenizer))

['/desktop', 'oton', 'bourne', 'oph', 'ine', ' чин', '981', 'ount', ' Duel', 'orp']


#### Gemma2

In [24]:
model_name = "google/gemma-2-2b"
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [31]:
token_embeds = model.model.embed_tokens.weight
value_vectors = torch.cat(
    [
        model.model.layers[layer_idx].mlp.down_proj.weight.T
        for layer_idx in range(model.config.num_hidden_layers)
    ],
    dim=0,
)
print(value_vectors.shape)

torch.Size([239616, 2304])


In [32]:
seed_token_pos = ["happy", "joy", " happy", " joy", " smile"]
seed_token_neg = [" sad", " angry", " disgust"]

pos_token_id = [
    tokenizer(tok, add_special_tokens=False)["input_ids"][0]
    for tok in seed_token_pos
    if len(tokenizer(tok, add_special_tokens=False)["input_ids"]) == 1
]

neg_token_id = [
    tokenizer(tok, add_special_tokens=False)["input_ids"][0]
    for tok in seed_token_neg
    if len(tokenizer(tok, add_special_tokens=False)["input_ids"]) == 1
]

print("Positive token IDs:", pos_token_id)
print("Negative token IDs:", neg_token_id)

pos_embed = token_embeds[pos_token_id].mean(dim=0)
neg_embed = token_embeds[neg_token_id].mean(dim=0)


Positive token IDs: [11896, 3375, 4915, 10300, 11491]
Negative token IDs: [9270, 19456, 41497]


In [33]:
def unembed_to_text(vector, model, tokenizer, k=10):
    norm = model.model.norm  
    lm_head = model.lm_head.weight

    normed_vector = norm(vector.unsqueeze(0)).squeeze(0)  # shape: [d_model]

    dots = torch.einsum("vd,d->v", lm_head, normed_vector)

    top_k = dots.topk(k).indices
    return tokenizer.batch_decode(top_k, skip_special_tokens=True)


In [34]:
k = 20
norm = model.model.norm  

pos_vec = pos_embed - neg_embed
dot_prods = torch.einsum("nd,d->n", norm(value_vectors), pos_vec)
top_value_vecs = dot_prods.topk(k).indices

for vec_idx in top_value_vecs:
    print(f"Value vec: Layer {vec_idx // 4096}, index {vec_idx % 4096}")
    print(unembed_to_text(value_vectors[vec_idx], model, tokenizer))


Value vec: Layer 54, index 3627
[' happy', ' Happy', 'Happy', 'happy', ' HAPPY', 'HAPPY', ' happiest', ' happier', ' happiness', ' satisfied']
Value vec: Layer 49, index 3230
[' happy', 'happy', ' Happy', 'Happy', ' HAPPY', 'HAPPY', ' happier', ' happiest', ' happily', ' feliz']
Value vec: Layer 47, index 192
[' happy', 'happy', ' happiness', ' sad', ' unhappy', ' Happy', ' HAPPY', ' sadness', ' happiest', ' unhappiness']
Value vec: Layer 51, index 3146
[' happy', ' joyful', ' joy', ' rejoice', ' cheerful', ' happiness', ' joyous', ' celebratory', ' pleasant', ' rejoicing']
Value vec: Layer 41, index 3916
[' Happy', 'Happy', ' happy', ' HAPPY', 'happy', 'HAPPY', ' feliz', ' happier', ' happiest', ' Feliz']
Value vec: Layer 56, index 2644
[' enjoyment', ' Enjoy', ' enjoy', 'enjoy', 'Enjoy', ' ENJOY', ' enjoyed', ' enjoying', ' enjoyable', ' pleasure']
Value vec: Layer 40, index 3812
[' enjoyment', ' pleasure', ' Enjoy', ' enjoy', ' enjoyable', 'enjoy', 'Enjoy', ' ENJOY', ' Pleasure', ' 

In [35]:
k = 20
norm = model.model.norm  

neg_vec = neg_embed - pos_embed 
dot_prods = torch.einsum("nd,d->n", norm(value_vectors), neg_vec)
top_value_vecs = dot_prods.topk(k).indices

for vec_idx in top_value_vecs:
    print(f"Value vec: Layer {vec_idx // 4096}, index {vec_idx % 4096}")
    print(unembed_to_text(value_vectors[vec_idx], model, tokenizer))


Value vec: Layer 42, index 3907
['aarrggbb', ' Dalio', 'VersionUID', ' ✭✭', 'IsMutable', ' ComVisible', ' AssemblyProduct', ' AFB', 'optString', 'uxxxx']
Value vec: Layer 43, index 3726
[' GenerationType', 'rotech', ' AssemblyCulture', '::$_', ' BoxDecoration', 'GenerationType', ' Auk', 'ereço', 'Artem', 'endphp']
Value vec: Layer 44, index 1954
[' Vikipedi', 'balleur', ' تضيفلها', ' lenker', 'UnknownFieldSet', 'thâu', 'ajara', ' Loh', 'nestjs', 'LVANIA']
Value vec: Layer 39, index 2649
[' mica', ' curio', ' oignon', ' cin', ' McIn', 'weiler', 'Интере', ' Curious', 'Quo', '!*\\']
Value vec: Layer 39, index 4088
['extAlignment', ' NgModule', 'GenerationType', ' bidra', 'EndContext', 'int', ' SqlCommand', ' degradability', 'Thing', 'IContainer']
Value vec: Layer 56, index 25
['bParam', ' تضيفلها', 'openzeppelin', 'tvguidetime', ' Pala', 'reactivex', 'ArrowToggle', ' creen', 'twimg', '@",']
Value vec: Layer 43, index 2877
[' sī', 'ритори', 'bootstrapcdn', '=>$', 'enumii', ' AssemblyTitle'

In [36]:
print(unembed_to_text(pos_vec, model, tokenizer))
print(unembed_to_text(neg_vec, model, tokenizer))

[' happy', 'happy', ' joy', ' Happy', 'Happy', 'joy', ' HAPPY', 'HAPPY', ' happiness', ' smile']
['Dom', 'astic', ' distin', ' Dom', 'dom', ' dom', 'DOM', ' DOM', ' distinctive', ' distingu']


#### Mistral

In [3]:
model_name = "mistralai/Mistral-7B-v0.1"
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
token_embeds = model.model.embed_tokens.weight
value_vectors = torch.cat(
    [
        model.model.layers[layer_idx].mlp.down_proj.weight.T
        for layer_idx in range(model.config.num_hidden_layers)
    ],
    dim=0,
)
print(value_vectors.shape)

torch.Size([458752, 4096])


In [6]:
seed_token_pos = ["happy", "joy", " happy", " joy", " smile"]
seed_token_neg = [" sad", " angry", " disgust"]

pos_token_id = [
    tokenizer(tok, add_special_tokens=False)["input_ids"][0]
    for tok in seed_token_pos
    if len(tokenizer(tok, add_special_tokens=False)["input_ids"]) == 1
]

neg_token_id = [
    tokenizer(tok, add_special_tokens=False)["input_ids"][0]
    for tok in seed_token_neg
    if len(tokenizer(tok, add_special_tokens=False)["input_ids"]) == 1
]

print("Positive token IDs:", pos_token_id)
print("Negative token IDs:", neg_token_id)

pos_embed = token_embeds[pos_token_id].mean(dim=0)
neg_embed = token_embeds[neg_token_id].mean(dim=0)


Positive token IDs: [4610, 10186, 4610, 10186, 6458]
Negative token IDs: [7456, 10545, 21536]


In [7]:
def unembed_to_text(vector, model, tokenizer, k=10):
    norm = model.model.norm  
    lm_head = model.lm_head.weight

    normed_vector = norm(vector.unsqueeze(0)).squeeze(0)  # shape: [d_model]

    dots = torch.einsum("vd,d->v", lm_head, normed_vector)

    top_k = dots.topk(k).indices
    return tokenizer.batch_decode(top_k, skip_special_tokens=True)


In [8]:
k = 20
norm = model.model.norm  

pos_vec = pos_embed - neg_embed
dot_prods = torch.einsum("nd,d->n", norm(value_vectors), pos_vec)
top_value_vecs = dot_prods.topk(k).indices

for vec_idx in top_value_vecs:
    print(f"Value vec: Layer {vec_idx // 4096}, index {vec_idx % 4096}")
    print(unembed_to_text(value_vectors[vec_idx], model, tokenizer))


Value vec: Layer 5, index 749
['당', 'anza', 'führ', 'enas', 'mals', 'acz', 'amas', 'agi', 'arin', 'atos']
Value vec: Layer 2, index 2113
['pty', 'eras', 'ea', 'atique', 'Oz', 'sop', 'stag', 'onne', 'lug', 'ante']
Value vec: Layer 1, index 1075
['smile', 'smiling', 'smiled', 'smiles', 'grin', 'reed', 'Sm', 'fx', 'lish', 'undo']
Value vec: Layer 20, index 3217
['happiness', 'joy', 'joy', 'smiles', 'happ', 'eb', 'happy', 'anda', 'smile', 'happily']
Value vec: Layer 2, index 800
['usk', 'inton', 'PARTICULAR', 'loyd', 'hire', 'appy', 'ERCHANT', 'sung', 'enth', 'gom']
Value vec: Layer 0, index 2453
['heimer', 'eno', 'ws', 'ERN', 'ENO', 'Franc', 'shipped', 'uelle', 'ou', 'owi']
Value vec: Layer 7, index 1075
['joy', 'joy', 'Joy', 'borg', 'Dru', 'ris', 'eria', 'eres', 'prem', 'owe']
Value vec: Layer 0, index 345
['pleasure', 'éri', 'printStackTrace', 'allo', 'GRO', 'iba', 'unexpected', 'println', 'har', 'Kevin']
Value vec: Layer 16, index 2478
['eph', 'Agency', 'cheek', 'stor', 'colo', 'colour

In [11]:
k = 20
norm = model.model.norm  

neg_vec = neg_embed - pos_embed 
dot_prods = torch.einsum("nd,d->n", norm(value_vectors), neg_vec)
top_value_vecs = dot_prods.topk(k).indices

for vec_idx in top_value_vecs:
    print(f"Value vec: Layer {vec_idx // 4096}, index {vec_idx % 4096}")
    print(unembed_to_text(value_vectors[vec_idx], model, tokenizer))


Value vec: Layer 3, index 1096
['conting', 'implicit', 'endorse', 'ague', 'powered', 'swe', 'dist', '력', 'па', '\\_']
Value vec: Layer 3, index 1257
['iera', 'même', 'mutable', 'iesa', 'ele', 'Tube', 'havet', 'chter', 'Lad', 'nero']
Value vec: Layer 11, index 3465
['iri', 'vir', 'digest', 'Bes', 'honey', 'саве', 'cold', 'alth', 'Grand', 'akten']
Value vec: Layer 9, index 3089
['лове', 'yle', '�', 'Cape', 'Sec', 'vers', 'routine', 'mann', 'uch', 'xE']
Value vec: Layer 21, index 1881
['Castle', 'cola', 'onna', 'ieron', 'Bast', 'uum', 'odio', 'шен', 'ikel', 'preview']
Value vec: Layer 53, index 1957
['aude', 'Tol', 'aw', 'caused', 'ASS', 'awi', 'auf', 'ond', 'MAGES', 'climb']
Value vec: Layer 14, index 917
['occupied', 'Buck', 'hint', 'lazy', 'ilt', 'uso', 'abled', 'cure', 'PC', 'geldig']
Value vec: Layer 0, index 2461
['come', 'ton', 'egründ', 'cov', 'début', 'commence', 'deeply', 'iom', 'intro', 'Haupt']
Value vec: Layer 1, index 1384
['passage', 'ív', 'spell', 'spell', 'bread', 'ńst', 

In [12]:
print(unembed_to_text(pos_vec, model, tokenizer))
print(unembed_to_text(neg_vec, model, tokenizer))

['/******/', 'acci', 'anco', '干', 'XR', 'ISH', 'aco', 'charg', 'XFF', 'brace']
['ronic', 'fruit', 'Bron', '体', 'final', 'ron', 'ret', 'flesh', 'pson', 'Pear']
